From 01045be6891ce469785791320e54422ee7fb6a33 Mon Sep 17 00:00:00 2001 From: Arpad Borsos Date: Wed, 16 Dec 2020 16:05:46 +0100 Subject: [PATCH 01/13] feat: Implement Session Stats API This is a new API for querying sessions and has a query interface similar to discover, and returns timeseries data. --- .../api/endpoints/organization_sessions.py | 327 ++++++++++++++++++ src/sentry/api/urls.py | 6 + 2 files changed, 333 insertions(+) create mode 100644 src/sentry/api/endpoints/organization_sessions.py diff --git a/src/sentry/api/endpoints/organization_sessions.py b/src/sentry/api/endpoints/organization_sessions.py new file mode 100644 index 00000000000000..1647c4c9114edc --- /dev/null +++ b/src/sentry/api/endpoints/organization_sessions.py @@ -0,0 +1,327 @@ +from __future__ import absolute_import + +from datetime import datetime +from rest_framework.response import Response + +import six +import sentry_sdk + +from sentry.api.bases import OrganizationEventsV2EndpointBase +from sentry.api.event_search import get_filter, InvalidSearchQuery +from sentry.utils.compat import filter, map +from sentry.utils.dates import get_rollup_from_request +from sentry.utils.snuba import Dataset, raw_query, to_naive_timestamp, naiveify_datetime + + +class ColumnDefinition(object): + """ + This defines the column mapping from a discover-like `name` to the + `snuba_columns` that feed into it. + + An `extractor` function can be given that transforms the raw data columns + into the expected output. By default, it will just use the single `snuba_columns`. + + A `default` must also be provided which is used when the row does not contain + any data. + """ + + def __init__(self, name, snuba_columns, default, extractor=None): + self.name = name + self.snuba_columns = snuba_columns + self.default = default + self.extractor = extractor + + def extract(self, row): + if row is None: + return self.default + + if self.extractor is not None: + value = self.extractor(row) + elif len(self.snuba_columns) == 1: + value = row[self.snuba_columns[0]] + else: + return self.default + + return value if value is not None else self.default + + +# Lets assume we have a recent enough snuba. +# TODO: Also, maybe we can run a custom aggregation over the `duration_quantiles`? 
+QUANTILE_MAP = {50: 0, 75: 1, 90: 2, 95: 3, 99: 4, 100: 5} + + +def extract_quantile(num): + def inner(row): + return row["duration_quantiles"][QUANTILE_MAP[num]] + + return inner + + +COLUMNS = [ + ColumnDefinition("sum(session)", ["sessions"], 0), + ColumnDefinition( + "sum(session.healthy)", + ["sessions", "sessions_errored"], + 0, + lambda row: row["sessions"] - row["sessions_errored"], + ), + ColumnDefinition("sum(session.errored)", ["sessions_errored"], 0), + ColumnDefinition("sum(session.abnormal)", ["sessions_abnormal"], 0), + ColumnDefinition("sum(session.crashed)", ["sessions_crashed"], 0), + ColumnDefinition("count_unique(user)", ["users"], 0), + ColumnDefinition( + "count_unique(user.healthy)", + ["users", "users_errored"], + 0, + lambda row: row["users"] - row["users_errored"], + ), + ColumnDefinition("count_unique(user.errored)", ["users_errored"], 0), + ColumnDefinition("count_unique(user.abnormal)", ["users_abnormal"], 0), + ColumnDefinition("count_unique(user.crashed)", ["users_crashed"], 0), + ColumnDefinition("p50(session.duration)", ["duration_quantiles"], None, extract_quantile(50)), + ColumnDefinition("p75(session.duration)", ["duration_quantiles"], None, extract_quantile(75)), + ColumnDefinition("p90(session.duration)", ["duration_quantiles"], None, extract_quantile(90)), + ColumnDefinition("p95(session.duration)", ["duration_quantiles"], None, extract_quantile(95)), + ColumnDefinition("p99(session.duration)", ["duration_quantiles"], None, extract_quantile(99)), + ColumnDefinition("max(session.duration)", ["duration_quantiles"], None, extract_quantile(100)), + ColumnDefinition("avg(session.duration)", ["duration_avg"], None), + ColumnDefinition("release", ["release"], ""), + ColumnDefinition("environment", ["environment"], ""), + ColumnDefinition("user_agent", ["user_agent"], ""), + ColumnDefinition("os", ["os"], ""), +] + +COLUMN_MAP = {column.name: column for column in COLUMNS} + + +class QueryDefinition(object): + """ + This is the definition of the query the user wants to execute. + This is constructed out of the request params, and also contains a list of + `fields` and `groupby` definitions as [`ColumnDefinition`] objects. + """ + + def __init__(self, request, params): + # self.request = request + # self.params = params + + self.query = request.GET.get("query") + raw_fields = request.GET.getlist("field", []) + raw_groupby = request.GET.getlist("groupBy", []) + + # TODO: maybe show a proper error message for unknown fields/groupby + self.fields = filter(None, (COLUMN_MAP.get(field) for field in raw_fields)) + self.groupby = filter(None, (COLUMN_MAP.get(field) for field in raw_groupby)) + + rollup = get_rollup_from_request( + request, + params, + "1h", + InvalidSearchQuery( + "Your interval and date range would create too many results. " + "Use a larger interval, or a smaller date range." 
+ ), + ) + # The minimum interval is one hour on the server + self.rollup = max(rollup, 3600) + + def extract_columns(lists): + columns = set() + for l in lists: + for field in l: + for column in field.snuba_columns: + columns.add(column) + return list(columns) + + self.query_columns = extract_columns([self.fields, self.groupby]) + self.query_groupby = extract_columns([self.groupby]) + + snuba_filter = get_timeseries_snuba_filter( + self.query_columns, self.query, params, self.rollup + ) + + self.start = snuba_filter.start + self.end = snuba_filter.end + self.aggregations = snuba_filter.aggregations + self.conditions = snuba_filter.conditions + + self.filter_keys = snuba_filter.filter_keys + + +TS_COL = "bucketed_started" + + +class OrganizationSessionsEndpoint(OrganizationEventsV2EndpointBase): + def get(self, request, organization): + # with self.handle_query_errors(): + query = self.build_sessions_query(request, organization) + result_totals, result_timeseries = run_sessions_query(query) + result = massage_sessions_result(query, result_totals, result_timeseries) + return Response(result, status=200) + + def build_sessions_query(self, request, organization): + with sentry_sdk.start_span(op="sessions.endpoint", description="build_sessions_query"): + params = self.get_snuba_params(request, organization, check_global_views=False) + + return QueryDefinition(request, params) + + +def get_timeseries_snuba_filter(selected_columns, query, params, rollup, default_count=True): + snuba_filter = get_filter(query, params) + if not snuba_filter.start and not snuba_filter.end: + raise InvalidSearchQuery("Cannot get timeseries result without a start and end.") + + return snuba_filter + + +def get_timestamps(query): + """ + Generates a list of timestamps according to `query`. + The timestamps are printed as strings. + """ + rollup = query.rollup + start = int(to_naive_timestamp(naiveify_datetime(query.start)) / rollup) * rollup + end = (int(to_naive_timestamp(naiveify_datetime(query.end)) / rollup) * rollup) + rollup + return [ + datetime.utcfromtimestamp(ts).isoformat() for ts in six.moves.xrange(start, end, rollup) + ] + + +def run_sessions_query(query): + """ + Runs the `query` as defined by [`QueryDefinition`] two times, once for the + `totals` and again for the actual time-series data grouped by the requested + interval. + """ + with sentry_sdk.start_span(op="sessions.discover", description="run_sessions_query"): + result_totals = raw_query( + dataset=Dataset.Sessions, + selected_columns=query.query_columns, + groupby=query.query_groupby, + aggregations=query.aggregations, + conditions=query.conditions, + filter_keys=query.filter_keys, + start=query.start, + end=query.end, + rollup=query.rollup, + referrer="sessions.totals", + ) + + result_timeseries = raw_query( + dataset=Dataset.Sessions, + selected_columns=[TS_COL] + query.query_columns, + groupby=[TS_COL] + query.query_groupby, + aggregations=query.aggregations, + conditions=query.conditions, + filter_keys=query.filter_keys, + start=query.start, + end=query.end, + rollup=query.rollup, + referrer="sessions.timeseries", + ) + + return result_totals, result_timeseries + + +def sane_groupby(it, keyfn): + """ + Basically the same as `itertools.groupby`, but without the requirement to + have the iterable sorted already by the keys, which can be super confusing + and breaks in surprising ways. 
+ """ + groups = {} + for elem in it: + key = keyfn(elem) + if key not in groups: + groups[key] = [] + groups[key].append(elem) + + return groups + + +def massage_sessions_result(query, result_totals, result_timeseries): + """ + Post-processes the query result. + + Given the `query` as defined by [`QueryDefinition`] and its totals and + timeseries results from snuba, groups and transforms the result into the + expected format. + + For example: + ```json + { + "intervals": [ + "2020-12-16T00:00:00Z", + "2020-12-16T12:00:00Z", + "2020-12-17T00:00:00Z" + ], + "groups": [ + { + "by": { "release": "99b8edc5a3bb49d01d16426d7bb9c511ec41f81e" }, + "series": { "sum(session)": [0, 1, 0] }, + "totals": { "sum(session)": 1 } + }, + { + "by": { "release": "test-example-release" }, + "series": { "sum(session)": [0, 10, 20] }, + "totals": { "sum(session)": 30 } + } + ] + } + ``` + """ + with sentry_sdk.start_span(op="sessions.discover", description="massage_sessions_result"): + timestamps = get_timestamps(query) + + def group_fn(row): + return tuple(field.extract(row) for field in query.groupby) + + total_groups = sane_groupby(result_totals["data"], group_fn) + timeseries_groups = sane_groupby(result_timeseries["data"], group_fn) + + def make_timeseries(group): + for row in group: + row[TS_COL] = row[TS_COL][:19] + + group.sort(key=lambda row: row[TS_COL]) + fields = [(field, list()) for field in query.fields] + group_index = 0 + + while group_index < len(group): + row = group[group_index] + if row[TS_COL] < timestamps[0]: + group_index += 1 + else: + break + + for ts in timestamps: + row = group[group_index] if group_index < len(group) else None + if row is not None and row[TS_COL] == ts: + group_index += 1 + else: + row = None + + for (field, series) in fields: + series.append(field.extract(row)) + + return {field.name: series for (field, series) in fields} + + def make_totals(totals): + return {field.name: field.extract(totals[0]) for field in query.fields} + + groups = [ + { + "by": {field.name: key[i] for i, field in enumerate(query.groupby)}, + "totals": make_totals(totals), + "series": make_timeseries(timeseries_groups[key]), + } + for key, totals in total_groups.items() + ] + + return { + # "query": query.query, + "intervals": map(lambda ts: ts + "Z", timestamps), + "groups": groups, + # "raw_timeseries": result_timeseries["data"], + # "raw_totals": result_totals["data"], + } diff --git a/src/sentry/api/urls.py b/src/sentry/api/urls.py index 844d90dd79324d..103d2c72adeeac 100644 --- a/src/sentry/api/urls.py +++ b/src/sentry/api/urls.py @@ -167,6 +167,7 @@ from .endpoints.organization_join_request import OrganizationJoinRequestEndpoint from .endpoints.organization_search_details import OrganizationSearchDetailsEndpoint from .endpoints.organization_searches import OrganizationSearchesEndpoint +from .endpoints.organization_sessions import OrganizationSessionsEndpoint from .endpoints.organization_sentry_apps import OrganizationSentryAppsEndpoint from .endpoints.organization_shortid import ShortIdLookupEndpoint from .endpoints.organization_slugs import SlugsUpdateEndpoint @@ -944,6 +945,11 @@ OrganizationSearchesEndpoint.as_view(), name="sentry-api-0-organization-searches", ), + url( + r"^(?P[^\/]+)/sessions/$", + OrganizationSessionsEndpoint.as_view(), + name="sentry-api-0-organization-sessions", + ), url( r"^(?P[^\/]+)/users/issues/$", OrganizationUserIssuesSearchEndpoint.as_view(), From 74c493c65870b0f49f314fb7aa05fc9d01b12138 Mon Sep 17 00:00:00 2001 From: Arpad Borsos Date: Fri, 18 Dec 2020 
12:43:35 +0100 Subject: [PATCH 02/13] start unit testing the internal sessions API --- .../api/endpoints/organization_sessions.py | 320 +-------------- src/sentry/snuba/sessions_v2.py | 386 ++++++++++++++++++ tests/snuba/sessions/test_sessions_v2.py | 113 +++++ 3 files changed, 512 insertions(+), 307 deletions(-) create mode 100644 src/sentry/snuba/sessions_v2.py create mode 100644 tests/snuba/sessions/test_sessions_v2.py diff --git a/src/sentry/api/endpoints/organization_sessions.py b/src/sentry/api/endpoints/organization_sessions.py index 1647c4c9114edc..5e8bea0592eae7 100644 --- a/src/sentry/api/endpoints/organization_sessions.py +++ b/src/sentry/api/endpoints/organization_sessions.py @@ -1,162 +1,29 @@ from __future__ import absolute_import -from datetime import datetime from rest_framework.response import Response -import six import sentry_sdk from sentry.api.bases import OrganizationEventsV2EndpointBase -from sentry.api.event_search import get_filter, InvalidSearchQuery -from sentry.utils.compat import filter, map -from sentry.utils.dates import get_rollup_from_request -from sentry.utils.snuba import Dataset, raw_query, to_naive_timestamp, naiveify_datetime - - -class ColumnDefinition(object): - """ - This defines the column mapping from a discover-like `name` to the - `snuba_columns` that feed into it. - - An `extractor` function can be given that transforms the raw data columns - into the expected output. By default, it will just use the single `snuba_columns`. - - A `default` must also be provided which is used when the row does not contain - any data. - """ - - def __init__(self, name, snuba_columns, default, extractor=None): - self.name = name - self.snuba_columns = snuba_columns - self.default = default - self.extractor = extractor - - def extract(self, row): - if row is None: - return self.default - - if self.extractor is not None: - value = self.extractor(row) - elif len(self.snuba_columns) == 1: - value = row[self.snuba_columns[0]] - else: - return self.default - - return value if value is not None else self.default - - -# Lets assume we have a recent enough snuba. -# TODO: Also, maybe we can run a custom aggregation over the `duration_quantiles`? 
-QUANTILE_MAP = {50: 0, 75: 1, 90: 2, 95: 3, 99: 4, 100: 5} - - -def extract_quantile(num): - def inner(row): - return row["duration_quantiles"][QUANTILE_MAP[num]] - - return inner - - -COLUMNS = [ - ColumnDefinition("sum(session)", ["sessions"], 0), - ColumnDefinition( - "sum(session.healthy)", - ["sessions", "sessions_errored"], - 0, - lambda row: row["sessions"] - row["sessions_errored"], - ), - ColumnDefinition("sum(session.errored)", ["sessions_errored"], 0), - ColumnDefinition("sum(session.abnormal)", ["sessions_abnormal"], 0), - ColumnDefinition("sum(session.crashed)", ["sessions_crashed"], 0), - ColumnDefinition("count_unique(user)", ["users"], 0), - ColumnDefinition( - "count_unique(user.healthy)", - ["users", "users_errored"], - 0, - lambda row: row["users"] - row["users_errored"], - ), - ColumnDefinition("count_unique(user.errored)", ["users_errored"], 0), - ColumnDefinition("count_unique(user.abnormal)", ["users_abnormal"], 0), - ColumnDefinition("count_unique(user.crashed)", ["users_crashed"], 0), - ColumnDefinition("p50(session.duration)", ["duration_quantiles"], None, extract_quantile(50)), - ColumnDefinition("p75(session.duration)", ["duration_quantiles"], None, extract_quantile(75)), - ColumnDefinition("p90(session.duration)", ["duration_quantiles"], None, extract_quantile(90)), - ColumnDefinition("p95(session.duration)", ["duration_quantiles"], None, extract_quantile(95)), - ColumnDefinition("p99(session.duration)", ["duration_quantiles"], None, extract_quantile(99)), - ColumnDefinition("max(session.duration)", ["duration_quantiles"], None, extract_quantile(100)), - ColumnDefinition("avg(session.duration)", ["duration_avg"], None), - ColumnDefinition("release", ["release"], ""), - ColumnDefinition("environment", ["environment"], ""), - ColumnDefinition("user_agent", ["user_agent"], ""), - ColumnDefinition("os", ["os"], ""), -] - -COLUMN_MAP = {column.name: column for column in COLUMNS} - - -class QueryDefinition(object): - """ - This is the definition of the query the user wants to execute. - This is constructed out of the request params, and also contains a list of - `fields` and `groupby` definitions as [`ColumnDefinition`] objects. - """ - - def __init__(self, request, params): - # self.request = request - # self.params = params - - self.query = request.GET.get("query") - raw_fields = request.GET.getlist("field", []) - raw_groupby = request.GET.getlist("groupBy", []) - - # TODO: maybe show a proper error message for unknown fields/groupby - self.fields = filter(None, (COLUMN_MAP.get(field) for field in raw_fields)) - self.groupby = filter(None, (COLUMN_MAP.get(field) for field in raw_groupby)) - - rollup = get_rollup_from_request( - request, - params, - "1h", - InvalidSearchQuery( - "Your interval and date range would create too many results. " - "Use a larger interval, or a smaller date range." 
- ), - ) - # The minimum interval is one hour on the server - self.rollup = max(rollup, 3600) - - def extract_columns(lists): - columns = set() - for l in lists: - for field in l: - for column in field.snuba_columns: - columns.add(column) - return list(columns) - - self.query_columns = extract_columns([self.fields, self.groupby]) - self.query_groupby = extract_columns([self.groupby]) - - snuba_filter = get_timeseries_snuba_filter( - self.query_columns, self.query, params, self.rollup - ) - - self.start = snuba_filter.start - self.end = snuba_filter.end - self.aggregations = snuba_filter.aggregations - self.conditions = snuba_filter.conditions - - self.filter_keys = snuba_filter.filter_keys - - -TS_COL = "bucketed_started" +from sentry.snuba.sessions_v2 import ( + QueryDefinition, + run_sessions_query, + massage_sessions_result, +) class OrganizationSessionsEndpoint(OrganizationEventsV2EndpointBase): def get(self, request, organization): # with self.handle_query_errors(): query = self.build_sessions_query(request, organization) - result_totals, result_timeseries = run_sessions_query(query) - result = massage_sessions_result(query, result_totals, result_timeseries) + + with sentry_sdk.start_span(op="sessions.endpoint", description="run_sessions_query"): + result_totals, result_timeseries = run_sessions_query(query) + + # print(result_totals, result_timeseries) + + with sentry_sdk.start_span(op="sessions.endpoint", description="massage_sessions_result"): + result = massage_sessions_result(query, result_totals, result_timeseries) return Response(result, status=200) def build_sessions_query(self, request, organization): @@ -164,164 +31,3 @@ def build_sessions_query(self, request, organization): params = self.get_snuba_params(request, organization, check_global_views=False) return QueryDefinition(request, params) - - -def get_timeseries_snuba_filter(selected_columns, query, params, rollup, default_count=True): - snuba_filter = get_filter(query, params) - if not snuba_filter.start and not snuba_filter.end: - raise InvalidSearchQuery("Cannot get timeseries result without a start and end.") - - return snuba_filter - - -def get_timestamps(query): - """ - Generates a list of timestamps according to `query`. - The timestamps are printed as strings. - """ - rollup = query.rollup - start = int(to_naive_timestamp(naiveify_datetime(query.start)) / rollup) * rollup - end = (int(to_naive_timestamp(naiveify_datetime(query.end)) / rollup) * rollup) + rollup - return [ - datetime.utcfromtimestamp(ts).isoformat() for ts in six.moves.xrange(start, end, rollup) - ] - - -def run_sessions_query(query): - """ - Runs the `query` as defined by [`QueryDefinition`] two times, once for the - `totals` and again for the actual time-series data grouped by the requested - interval. 
- """ - with sentry_sdk.start_span(op="sessions.discover", description="run_sessions_query"): - result_totals = raw_query( - dataset=Dataset.Sessions, - selected_columns=query.query_columns, - groupby=query.query_groupby, - aggregations=query.aggregations, - conditions=query.conditions, - filter_keys=query.filter_keys, - start=query.start, - end=query.end, - rollup=query.rollup, - referrer="sessions.totals", - ) - - result_timeseries = raw_query( - dataset=Dataset.Sessions, - selected_columns=[TS_COL] + query.query_columns, - groupby=[TS_COL] + query.query_groupby, - aggregations=query.aggregations, - conditions=query.conditions, - filter_keys=query.filter_keys, - start=query.start, - end=query.end, - rollup=query.rollup, - referrer="sessions.timeseries", - ) - - return result_totals, result_timeseries - - -def sane_groupby(it, keyfn): - """ - Basically the same as `itertools.groupby`, but without the requirement to - have the iterable sorted already by the keys, which can be super confusing - and breaks in surprising ways. - """ - groups = {} - for elem in it: - key = keyfn(elem) - if key not in groups: - groups[key] = [] - groups[key].append(elem) - - return groups - - -def massage_sessions_result(query, result_totals, result_timeseries): - """ - Post-processes the query result. - - Given the `query` as defined by [`QueryDefinition`] and its totals and - timeseries results from snuba, groups and transforms the result into the - expected format. - - For example: - ```json - { - "intervals": [ - "2020-12-16T00:00:00Z", - "2020-12-16T12:00:00Z", - "2020-12-17T00:00:00Z" - ], - "groups": [ - { - "by": { "release": "99b8edc5a3bb49d01d16426d7bb9c511ec41f81e" }, - "series": { "sum(session)": [0, 1, 0] }, - "totals": { "sum(session)": 1 } - }, - { - "by": { "release": "test-example-release" }, - "series": { "sum(session)": [0, 10, 20] }, - "totals": { "sum(session)": 30 } - } - ] - } - ``` - """ - with sentry_sdk.start_span(op="sessions.discover", description="massage_sessions_result"): - timestamps = get_timestamps(query) - - def group_fn(row): - return tuple(field.extract(row) for field in query.groupby) - - total_groups = sane_groupby(result_totals["data"], group_fn) - timeseries_groups = sane_groupby(result_timeseries["data"], group_fn) - - def make_timeseries(group): - for row in group: - row[TS_COL] = row[TS_COL][:19] - - group.sort(key=lambda row: row[TS_COL]) - fields = [(field, list()) for field in query.fields] - group_index = 0 - - while group_index < len(group): - row = group[group_index] - if row[TS_COL] < timestamps[0]: - group_index += 1 - else: - break - - for ts in timestamps: - row = group[group_index] if group_index < len(group) else None - if row is not None and row[TS_COL] == ts: - group_index += 1 - else: - row = None - - for (field, series) in fields: - series.append(field.extract(row)) - - return {field.name: series for (field, series) in fields} - - def make_totals(totals): - return {field.name: field.extract(totals[0]) for field in query.fields} - - groups = [ - { - "by": {field.name: key[i] for i, field in enumerate(query.groupby)}, - "totals": make_totals(totals), - "series": make_timeseries(timeseries_groups[key]), - } - for key, totals in total_groups.items() - ] - - return { - # "query": query.query, - "intervals": map(lambda ts: ts + "Z", timestamps), - "groups": groups, - # "raw_timeseries": result_timeseries["data"], - # "raw_totals": result_totals["data"], - } diff --git a/src/sentry/snuba/sessions_v2.py b/src/sentry/snuba/sessions_v2.py new file mode 100644 
index 00000000000000..091033bfbe1a33 --- /dev/null +++ b/src/sentry/snuba/sessions_v2.py @@ -0,0 +1,386 @@ +from __future__ import absolute_import + +from datetime import datetime + +import six + +from sentry.api.event_search import get_filter, InvalidSearchQuery +from sentry.api.utils import get_date_range_from_params +from sentry.utils.compat import filter +from sentry.utils.dates import get_rollup_from_request +from sentry.utils.snuba import Dataset, raw_query, to_naive_timestamp, naiveify_datetime + +""" +The new Sessions API defines a "metrics"-like interface which is can be used in +a similar way to "discover". +See https://www.notion.so/sentry/Session-Stats-API-0016d3713d1a4276be0396a338c7930a + +# "Metrics" + +We have basically 3 "metrics" that we can query: + +- `session` (counter): The number of sessions that occurred +- `user` (set): The set of `dinstinct_id`s. +- `session.duration` (histogram): The duration of individual sessions + (not available for pre-aggregated sessions) + +# "Operations" on metrics + +Depending on the metric *type*, we can query different things: + +- `counter`: Can only be accessed via the `sum` function. +- `set`: Can only be accessed via the `count_unique` function. +- `histogram`: Can have different quantiles / averages available via: + `avg`, `p50`...`p99`, `max`. + +# Tags / Grouping + +The Session data can be grouped by a set of tags, which can only appear in the +`groupBy` of the query. + +- `environment` +- `release`: + TODO: describe specific release filters such as `release.major`, etc + +## "Virtual" tags + +The `session.status` is considered a "virtual" tag, as it does not appear as +such in the current session dataset. Instead the status is represented as +different columns in dataset, and it is "exploded" into distinct groups purely +in code, which is the tricky part. + +Essentially, a Row like this: +``` +{ + sessions: 123 + sessions_abnormal: 4, + sessions_crashed: 3, + sessions_errored: 23, +} +``` + +Is then "exploded" into something like: + +``` +[{ + by: { "session.status": "healthy" }, + series: { + "sum(session)": [100, ...] <- this is `sessions - sessions_errored` + } +}, { + by: { "session.status": "errored" }, + series: { + "sum(session)": [23, ...] + } +}, +... +] +``` +""" + + +class ColumnDefinition(object): + """ + This defines the column mapping from a discover-like `name` to the + `snuba_columns` that feed into it. + + An `extractor` function can be given that transforms the raw data columns + into the expected output. By default, it will just use the single `snuba_columns`. + + A `default` must also be provided which is used when the row does not contain + any data. + """ + + def __init__(self, name, snuba_columns, default, extractor=None): + self.name = name + self.snuba_columns = snuba_columns + self.default = default + self.extractor = extractor + + def extract(self, row): + if row is None: + return self.default + + if self.extractor is not None: + value = self.extractor(row) + elif len(self.snuba_columns) == 1: + value = row[self.snuba_columns[0]] + else: + return self.default + + return value if value is not None else self.default + + +# Lets assume we have a recent enough snuba. +# TODO: Also, maybe we can run a custom aggregation over the `duration_quantiles`? 
+QUANTILE_MAP = {50: 0, 75: 1, 90: 2, 95: 3, 99: 4, 100: 5} + + +def extract_quantile(num): + def inner(row): + return row["duration_quantiles"][QUANTILE_MAP[num]] + + return inner + + +COLUMNS = [ + ColumnDefinition("sum(session)", ["sessions"], 0), + ColumnDefinition( + "sum(session.healthy)", + ["sessions", "sessions_errored"], + 0, + lambda row: row["sessions"] - row["sessions_errored"], + ), + ColumnDefinition("sum(session.errored)", ["sessions_errored"], 0), + ColumnDefinition("sum(session.abnormal)", ["sessions_abnormal"], 0), + ColumnDefinition("sum(session.crashed)", ["sessions_crashed"], 0), + ColumnDefinition("count_unique(user)", ["users"], 0), + ColumnDefinition( + "count_unique(user.healthy)", + ["users", "users_errored"], + 0, + lambda row: row["users"] - row["users_errored"], + ), + ColumnDefinition("count_unique(user.errored)", ["users_errored"], 0), + ColumnDefinition("count_unique(user.abnormal)", ["users_abnormal"], 0), + ColumnDefinition("count_unique(user.crashed)", ["users_crashed"], 0), + ColumnDefinition("p50(session.duration)", ["duration_quantiles"], None, extract_quantile(50)), + ColumnDefinition("p75(session.duration)", ["duration_quantiles"], None, extract_quantile(75)), + ColumnDefinition("p90(session.duration)", ["duration_quantiles"], None, extract_quantile(90)), + ColumnDefinition("p95(session.duration)", ["duration_quantiles"], None, extract_quantile(95)), + ColumnDefinition("p99(session.duration)", ["duration_quantiles"], None, extract_quantile(99)), + ColumnDefinition("max(session.duration)", ["duration_quantiles"], None, extract_quantile(100)), + ColumnDefinition("avg(session.duration)", ["duration_avg"], None), + ColumnDefinition("release", ["release"], ""), + ColumnDefinition("environment", ["environment"], ""), + ColumnDefinition("user_agent", ["user_agent"], ""), + ColumnDefinition("os", ["os"], ""), +] + +COLUMN_MAP = {column.name: column for column in COLUMNS} + + +class QueryDefinition(object): + """ + This is the definition of the query the user wants to execute. + This is constructed out of the request params, and also contains a list of + `fields` and `groupby` definitions as [`ColumnDefinition`] objects. + """ + + def __init__(self, request, params): + # self.request = request + # self.params = params + + self.query = request.GET.get("query") + raw_fields = request.GET.getlist("field", []) + raw_groupby = request.GET.getlist("groupBy", []) + + # TODO: maybe show a proper error message for unknown fields/groupby + self.fields = filter(None, (COLUMN_MAP.get(field) for field in raw_fields)) + self.groupby = filter(None, (COLUMN_MAP.get(field) for field in raw_groupby)) + + rollup = get_rollup_from_request( + request, + params, + "1h", + InvalidSearchQuery( + "Your interval and date range would create too many results. " + "Use a larger interval, or a smaller date range." 
+ ), + ) + # The minimum interval is one hour on the server + self.rollup = max(rollup, 3600) + + def extract_columns(lists): + columns = set() + for l in lists: + for field in l: + for column in field.snuba_columns: + columns.add(column) + return list(columns) + + self.query_columns = extract_columns([self.fields, self.groupby]) + self.query_groupby = extract_columns([self.groupby]) + + snuba_filter = get_timeseries_snuba_filter( + self.query_columns, self.query, params, self.rollup + ) + + self.start = snuba_filter.start + self.end = snuba_filter.end + self.aggregations = snuba_filter.aggregations + self.conditions = snuba_filter.conditions + + self.filter_keys = snuba_filter.filter_keys + + +def get_timeseries_snuba_filter(selected_columns, query, params, rollup, default_count=True): + snuba_filter = get_filter(query, params) + if not snuba_filter.start and not snuba_filter.end: + raise InvalidSearchQuery("Cannot get timeseries result without a start and end.") + + return snuba_filter + + +def _get_query_from_request(request): + """ + This creates our [`QueryDefinition`] from the given `request`, which itself + is a bit unfortunate that this depends on HTTP types. + """ + start, end = get_date_range_from_params(request.GET) + params = {"start": start, "end": end} + # TODO: refactor all of this so it does not depend on a `request`. + return QueryDefinition(request, params) + + +TS_COL = "bucketed_started" + + +def run_sessions_query(query): + """ + Runs the `query` as defined by [`QueryDefinition`] two times, once for the + `totals` and again for the actual time-series data grouped by the requested + interval. + """ + result_totals = raw_query( + dataset=Dataset.Sessions, + selected_columns=query.query_columns, + groupby=query.query_groupby, + aggregations=query.aggregations, + conditions=query.conditions, + filter_keys=query.filter_keys, + start=query.start, + end=query.end, + rollup=query.rollup, + referrer="sessions.totals", + ) + + result_timeseries = raw_query( + dataset=Dataset.Sessions, + selected_columns=[TS_COL] + query.query_columns, + groupby=[TS_COL] + query.query_groupby, + aggregations=query.aggregations, + conditions=query.conditions, + filter_keys=query.filter_keys, + start=query.start, + end=query.end, + rollup=query.rollup, + referrer="sessions.timeseries", + ) + + return result_totals["data"], result_timeseries["data"] + + +def massage_sessions_result(query, result_totals, result_timeseries): + """ + Post-processes the query result. + + Given the `query` as defined by [`QueryDefinition`] and its totals and + timeseries results from snuba, groups and transforms the result into the + expected format. 
+ + For example: + ```json + { + "intervals": [ + "2020-12-16T00:00:00Z", + "2020-12-16T12:00:00Z", + "2020-12-17T00:00:00Z" + ], + "groups": [ + { + "by": { "release": "99b8edc5a3bb49d01d16426d7bb9c511ec41f81e" }, + "series": { "sum(session)": [0, 1, 0] }, + "totals": { "sum(session)": 1 } + }, + { + "by": { "release": "test-example-release" }, + "series": { "sum(session)": [0, 10, 20] }, + "totals": { "sum(session)": 30 } + } + ] + } + ``` + """ + timestamps = _get_timestamps(query) + + def group_fn(row): + return tuple(field.extract(row) for field in query.groupby) + + total_groups = sane_groupby(result_totals, group_fn) + timeseries_groups = sane_groupby(result_timeseries, group_fn) + + def make_timeseries(group): + for row in group: + row[TS_COL] = row[TS_COL][:19] + "Z" + + group.sort(key=lambda row: row[TS_COL]) + fields = [(field, list()) for field in query.fields] + group_index = 0 + + while group_index < len(group): + row = group[group_index] + if row[TS_COL] < timestamps[0]: + group_index += 1 + else: + break + + for ts in timestamps: + row = group[group_index] if group_index < len(group) else None + if row is not None and row[TS_COL] == ts: + group_index += 1 + else: + row = None + + for (field, series) in fields: + series.append(field.extract(row)) + + return {field.name: series for (field, series) in fields} + + def make_totals(totals): + return {field.name: field.extract(totals[0]) for field in query.fields} + + groups = [ + { + "by": {field.name: key[i] for i, field in enumerate(query.groupby)}, + "totals": make_totals(totals), + "series": make_timeseries(timeseries_groups[key]), + } + for key, totals in total_groups.items() + ] + + return { + # "query": query.query, + "intervals": timestamps, + "groups": groups, + } + + +def _get_timestamps(query): + """ + Generates a list of timestamps according to `query`. + The timestamps are returned as ISO strings for now. + """ + rollup = query.rollup + start = int(to_naive_timestamp(naiveify_datetime(query.start)) / rollup) * rollup + end = (int(to_naive_timestamp(naiveify_datetime(query.end)) / rollup) * rollup) + rollup + return [ + datetime.utcfromtimestamp(ts).isoformat() + "Z" + for ts in six.moves.xrange(start, end, rollup) + ] + + +def sane_groupby(it, keyfn): + """ + Basically the same as `itertools.groupby`, but without the requirement to + have the iterable sorted already by the keys, which can be super confusing + and breaks in surprising ways. 
+ """ + groups = {} + for elem in it: + key = keyfn(elem) + if key not in groups: + groups[key] = [] + groups[key].append(elem) + + return groups diff --git a/tests/snuba/sessions/test_sessions_v2.py b/tests/snuba/sessions/test_sessions_v2.py new file mode 100644 index 00000000000000..f30133aae7d3ac --- /dev/null +++ b/tests/snuba/sessions/test_sessions_v2.py @@ -0,0 +1,113 @@ +from __future__ import absolute_import + +from freezegun import freeze_time +from django.http import QueryDict + +# from sentry.testutils import TestCase +from sentry.snuba.sessions_v2 import ( + massage_sessions_result, + _get_query_from_request, + _get_timestamps, +) + + +class MockRequest(object): + def __init__(self, qs): + self.GET = QueryDict(qs) + + +@freeze_time("2020-12-18T11:14:17.105Z") +def test_timestamps(): + query = _get_query_from_request(MockRequest("statsPeriod=1d&interval=12h")) + + expected_timestamps = ["2020-12-17T00:00:00Z", "2020-12-17T12:00:00Z", "2020-12-18T00:00:00Z"] + actual_timestamps = _get_timestamps(query) + + assert actual_timestamps == expected_timestamps + + +@freeze_time("2020-12-18T11:14:17.105Z") +def test_massage_simple_timeseries(): + """A timeseries is filled up when it only receives partial data""" + + query = _get_query_from_request(MockRequest("statsPeriod=1d&interval=6h&field=sum(session)")) + result_totals = [{"sessions": 4}] + # snuba returns the datetimes as strings for now + result_timeseries = [ + {"sessions": 2, "bucketed_started": "2020-12-17T12:00:00+00:00"}, + {"sessions": 2, "bucketed_started": "2020-12-18T06:00:00+00:00"}, + ] + + expected_result = { + "intervals": [ + "2020-12-17T06:00:00Z", + "2020-12-17T12:00:00Z", + "2020-12-17T18:00:00Z", + "2020-12-18T00:00:00Z", + "2020-12-18T06:00:00Z", + ], + "groups": [ + {"series": {"sum(session)": [0, 2, 0, 0, 2]}, "by": {}, "totals": {"sum(session)": 4}} + ], + } + + actual_result = massage_sessions_result(query, result_totals, result_timeseries) + + assert actual_result == expected_result + + +@freeze_time("2020-12-18T11:14:17.105Z") +def test_massage_groupby_timeseries(): + """A timeseries is filled up when it only receives partial data""" + + query = _get_query_from_request( + MockRequest("statsPeriod=1d&interval=6h&field=sum(session)&groupBy=release") + ) + result_totals = [ + {"release": "test-example-release", "sessions": 4}, + {"release": "test-example-release-2", "sessions": 1}, + ] + # snuba returns the datetimes as strings for now + result_timeseries = [ + { + "release": "test-example-release", + "sessions": 2, + "bucketed_started": "2020-12-17T12:00:00+00:00", + }, + { + "release": "test-example-release", + "sessions": 2, + "bucketed_started": "2020-12-18T06:00:00+00:00", + }, + { + "release": "test-example-release-2", + "sessions": 1, + "bucketed_started": "2020-12-18T06:00:00+00:00", + }, + ] + + expected_result = { + "intervals": [ + "2020-12-17T06:00:00Z", + "2020-12-17T12:00:00Z", + "2020-12-17T18:00:00Z", + "2020-12-18T00:00:00Z", + "2020-12-18T06:00:00Z", + ], + "groups": [ + { + "series": {"sum(session)": [0, 2, 0, 0, 2]}, + "by": {"release": "test-example-release"}, + "totals": {"sum(session)": 4}, + }, + { + "series": {"sum(session)": [0, 0, 0, 0, 1]}, + "by": {"release": "test-example-release-2"}, + "totals": {"sum(session)": 1}, + }, + ], + } + + actual_result = massage_sessions_result(query, result_totals, result_timeseries) + + assert actual_result == expected_result From 65c266fc774f74b41863a262ad18949e745ef1c1 Mon Sep 17 00:00:00 2001 From: Arpad Borsos Date: Fri, 18 Dec 2020 14:27:00 
+0100 Subject: [PATCH 03/13] implement virtual groupby --- src/sentry/snuba/sessions_v2.py | 276 +++++++++++++---------- tests/snuba/sessions/test_sessions_v2.py | 129 ++++++++++- 2 files changed, 283 insertions(+), 122 deletions(-) diff --git a/src/sentry/snuba/sessions_v2.py b/src/sentry/snuba/sessions_v2.py index 091033bfbe1a33..4e450c591e19f4 100644 --- a/src/sentry/snuba/sessions_v2.py +++ b/src/sentry/snuba/sessions_v2.py @@ -3,10 +3,10 @@ from datetime import datetime import six +import itertools from sentry.api.event_search import get_filter, InvalidSearchQuery from sentry.api.utils import get_date_range_from_params -from sentry.utils.compat import filter from sentry.utils.dates import get_rollup_from_request from sentry.utils.snuba import Dataset, raw_query, to_naive_timestamp, naiveify_datetime @@ -79,85 +79,118 @@ """ -class ColumnDefinition(object): - """ - This defines the column mapping from a discover-like `name` to the - `snuba_columns` that feed into it. +class SessionsField(object): + def get_snuba_columns(self, raw_groupby): + if "session.status" in raw_groupby: + return ["sessions", "sessions_abnormal", "sessions_crashed", "sessions_errored"] + return ["sessions"] + + def extract_from_row(self, row, group): + if row is None: + return 0 + status = group.get("session.status") + if status is None: + return row["sessions"] + if status == "healthy": + return row["sessions"] - row["sessions_errored"] + if status == "abnormal": + return row["sessions_abnormal"] + if status == "crashed": + return row["sessions_crashed"] + if status == "errored": + return row["sessions_errored"] + return 0 + + +class UsersField(object): + def get_snuba_columns(self, raw_groupby): + if "session.status" in raw_groupby: + return ["users", "users_abnormal", "users_crashed", "users_errored"] + return ["users"] + + def extract_from_row(self, row, group): + if row is None: + return 0 + status = group.get("session.status") + if status is None: + return row["users"] + if status == "healthy": + return row["users"] - row["users_errored"] + if status == "abnormal": + return row["users_abnormal"] + if status == "crashed": + return row["users_crashed"] + if status == "errored": + return row["users_errored"] + return 0 + + +class DurationAverageField(object): + def get_snuba_columns(self, raw_groupby): + return ["duration_avg"] + + def extract_from_row(self, row, group): + if row is None: + return None + return row["duration_avg"] - An `extractor` function can be given that transforms the raw data columns - into the expected output. By default, it will just use the single `snuba_columns`. - A `default` must also be provided which is used when the row does not contain - any data. - """ +class DurationQuantileField(object): + def __init__(self, quantile_index): + self.quantile_index = quantile_index - def __init__(self, name, snuba_columns, default, extractor=None): - self.name = name - self.snuba_columns = snuba_columns - self.default = default - self.extractor = extractor + def get_snuba_columns(self, raw_groupby): + return ["duration_quantiles"] - def extract(self, row): + def extract_from_row(self, row, group): if row is None: - return self.default - - if self.extractor is not None: - value = self.extractor(row) - elif len(self.snuba_columns) == 1: - value = row[self.snuba_columns[0]] - else: - return self.default - - return value if value is not None else self.default - - -# Lets assume we have a recent enough snuba. -# TODO: Also, maybe we can run a custom aggregation over the `duration_quantiles`? 
-QUANTILE_MAP = {50: 0, 75: 1, 90: 2, 95: 3, 99: 4, 100: 5} - - -def extract_quantile(num): - def inner(row): - return row["duration_quantiles"][QUANTILE_MAP[num]] - - return inner - - -COLUMNS = [ - ColumnDefinition("sum(session)", ["sessions"], 0), - ColumnDefinition( - "sum(session.healthy)", - ["sessions", "sessions_errored"], - 0, - lambda row: row["sessions"] - row["sessions_errored"], - ), - ColumnDefinition("sum(session.errored)", ["sessions_errored"], 0), - ColumnDefinition("sum(session.abnormal)", ["sessions_abnormal"], 0), - ColumnDefinition("sum(session.crashed)", ["sessions_crashed"], 0), - ColumnDefinition("count_unique(user)", ["users"], 0), - ColumnDefinition( - "count_unique(user.healthy)", - ["users", "users_errored"], - 0, - lambda row: row["users"] - row["users_errored"], - ), - ColumnDefinition("count_unique(user.errored)", ["users_errored"], 0), - ColumnDefinition("count_unique(user.abnormal)", ["users_abnormal"], 0), - ColumnDefinition("count_unique(user.crashed)", ["users_crashed"], 0), - ColumnDefinition("p50(session.duration)", ["duration_quantiles"], None, extract_quantile(50)), - ColumnDefinition("p75(session.duration)", ["duration_quantiles"], None, extract_quantile(75)), - ColumnDefinition("p90(session.duration)", ["duration_quantiles"], None, extract_quantile(90)), - ColumnDefinition("p95(session.duration)", ["duration_quantiles"], None, extract_quantile(95)), - ColumnDefinition("p99(session.duration)", ["duration_quantiles"], None, extract_quantile(99)), - ColumnDefinition("max(session.duration)", ["duration_quantiles"], None, extract_quantile(100)), - ColumnDefinition("avg(session.duration)", ["duration_avg"], None), - ColumnDefinition("release", ["release"], ""), - ColumnDefinition("environment", ["environment"], ""), - ColumnDefinition("user_agent", ["user_agent"], ""), - ColumnDefinition("os", ["os"], ""), -] + return None + return row["duration_quantiles"][self.quantile_index] + + +COLUMN_MAP = { + "sum(session)": SessionsField(), + "count_unique(user)": UsersField(), + "avg(session.duration)": DurationAverageField(), + "p50(session.duration)": DurationQuantileField(0), + "p75(session.duration)": DurationQuantileField(1), + "p90(session.duration)": DurationQuantileField(2), + "p95(session.duration)": DurationQuantileField(3), + "p99(session.duration)": DurationQuantileField(4), + "max(session.duration)": DurationQuantileField(5), +} + + +class SimpleGroupBy(object): + def __init__(self, name): + self.name = name + + def get_snuba_columns(self): + return [self.name] + + def get_snuba_groupby(self): + return [self.name] + + def get_keys_for_row(self, row): + return [(self.name, row[self.name])] + -COLUMN_MAP = {column.name: column for column in COLUMNS} +class SessionStatusGroupBy(object): + def get_snuba_columns(self): + return [] + + def get_snuba_groupby(self): + return [] + + def get_keys_for_row(self, row): + return [("session.status", key) for key in ["healthy", "abnormal", "crashed", "errored"]] + + +GROUPBY_MAP = { + "environment": SimpleGroupBy("environment"), + "release": SimpleGroupBy("release"), + "session.status": SessionStatusGroupBy(), +} class QueryDefinition(object): @@ -176,8 +209,8 @@ def __init__(self, request, params): raw_groupby = request.GET.getlist("groupBy", []) # TODO: maybe show a proper error message for unknown fields/groupby - self.fields = filter(None, (COLUMN_MAP.get(field) for field in raw_fields)) - self.groupby = filter(None, (COLUMN_MAP.get(field) for field in raw_groupby)) + self.fields = {field: COLUMN_MAP.get(field) 
for field in raw_fields} + self.groupby = [GROUPBY_MAP.get(field) for field in raw_groupby] rollup = get_rollup_from_request( request, @@ -191,16 +224,17 @@ def __init__(self, request, params): # The minimum interval is one hour on the server self.rollup = max(rollup, 3600) - def extract_columns(lists): - columns = set() - for l in lists: - for field in l: - for column in field.snuba_columns: - columns.add(column) - return list(columns) + query_columns = set() + for field in self.fields.values(): + query_columns.update(field.get_snuba_columns(raw_groupby)) + for groupby in self.groupby: + query_columns.update(groupby.get_snuba_columns()) + self.query_columns = list(query_columns) - self.query_columns = extract_columns([self.fields, self.groupby]) - self.query_groupby = extract_columns([self.groupby]) + query_groupby = set() + for groupby in self.groupby: + query_groupby.update(groupby.get_snuba_groupby()) + self.query_groupby = list(query_groupby) snuba_filter = get_timeseries_snuba_filter( self.query_columns, self.query, params, self.rollup @@ -304,50 +338,52 @@ def massage_sessions_result(query, result_totals, result_timeseries): """ timestamps = _get_timestamps(query) - def group_fn(row): - return tuple(field.extract(row) for field in query.groupby) - - total_groups = sane_groupby(result_totals, group_fn) - timeseries_groups = sane_groupby(result_timeseries, group_fn) + total_groups = _split_rows_groupby(result_totals, query.groupby) + timeseries_groups = _split_rows_groupby(result_timeseries, query.groupby) - def make_timeseries(group): - for row in group: + def make_timeseries(rows, group): + for row in rows: row[TS_COL] = row[TS_COL][:19] + "Z" - group.sort(key=lambda row: row[TS_COL]) - fields = [(field, list()) for field in query.fields] + rows.sort(key=lambda row: row[TS_COL]) + fields = [(name, field, list()) for name, field in query.fields.items()] group_index = 0 - while group_index < len(group): - row = group[group_index] + while group_index < len(rows): + row = rows[group_index] if row[TS_COL] < timestamps[0]: group_index += 1 else: break for ts in timestamps: - row = group[group_index] if group_index < len(group) else None + row = rows[group_index] if group_index < len(rows) else None if row is not None and row[TS_COL] == ts: group_index += 1 else: row = None - for (field, series) in fields: - series.append(field.extract(row)) + for (name, field, series) in fields: + series.append(field.extract_from_row(row, group)) - return {field.name: series for (field, series) in fields} + return {name: series for (name, field, series) in fields} - def make_totals(totals): - return {field.name: field.extract(totals[0]) for field in query.fields} + def make_totals(totals, group): + return { + name: field.extract_from_row(totals[0], group) for name, field in query.fields.items() + } - groups = [ - { - "by": {field.name: key[i] for i, field in enumerate(query.groupby)}, - "totals": make_totals(totals), - "series": make_timeseries(timeseries_groups[key]), + groups = [] + for key, totals in total_groups.items(): + by = dict(key) + + group = { + "by": by, + "totals": make_totals(totals, by), + "series": make_timeseries(timeseries_groups[key], by), } - for key, totals in total_groups.items() - ] + + groups.append(group) return { # "query": query.query, @@ -370,17 +406,17 @@ def _get_timestamps(query): ] -def sane_groupby(it, keyfn): - """ - Basically the same as `itertools.groupby`, but without the requirement to - have the iterable sorted already by the keys, which can be super confusing - and 
breaks in surprising ways. - """ +def _split_rows_groupby(rows, groupby): groups = {} - for elem in it: - key = keyfn(elem) - if key not in groups: - groups[key] = [] - groups[key].append(elem) + for row in rows: + key_parts = (group.get_keys_for_row(row) for group in groupby) + keys = itertools.product(*key_parts) + + for key in keys: + key = frozenset(key) + + if key not in groups: + groups[key] = [] + groups[key].append(row) return groups diff --git a/tests/snuba/sessions/test_sessions_v2.py b/tests/snuba/sessions/test_sessions_v2.py index f30133aae7d3ac..45aa78ed2211bb 100644 --- a/tests/snuba/sessions/test_sessions_v2.py +++ b/tests/snuba/sessions/test_sessions_v2.py @@ -26,6 +26,47 @@ def test_timestamps(): assert actual_timestamps == expected_timestamps +def test_simple_query(): + query = _get_query_from_request(MockRequest("statsPeriod=1d&interval=12h&field=sum(session)")) + + assert query.query_columns == ["sessions"] + + +def test_groupby_query(): + query = _get_query_from_request( + MockRequest("statsPeriod=1d&interval=12h&field=sum(session)&groupBy=release") + ) + + assert sorted(query.query_columns) == ["release", "sessions"] + assert query.query_groupby == ["release"] + + +def test_virtual_groupby_query(): + query = _get_query_from_request( + MockRequest("statsPeriod=1d&interval=12h&field=sum(session)&groupBy=session.status") + ) + + assert sorted(query.query_columns) == [ + "sessions", + "sessions_abnormal", + "sessions_crashed", + "sessions_errored", + ] + assert query.query_groupby == [] + + query = _get_query_from_request( + MockRequest("statsPeriod=1d&interval=12h&field=count_unique(user)&groupBy=session.status") + ) + + assert sorted(query.query_columns) == [ + "users", + "users_abnormal", + "users_crashed", + "users_errored", + ] + assert query.query_groupby == [] + + @freeze_time("2020-12-18T11:14:17.105Z") def test_massage_simple_timeseries(): """A timeseries is filled up when it only receives partial data""" @@ -58,8 +99,6 @@ def test_massage_simple_timeseries(): @freeze_time("2020-12-18T11:14:17.105Z") def test_massage_groupby_timeseries(): - """A timeseries is filled up when it only receives partial data""" - query = _get_query_from_request( MockRequest("statsPeriod=1d&interval=6h&field=sum(session)&groupBy=release") ) @@ -111,3 +150,89 @@ def test_massage_groupby_timeseries(): actual_result = massage_sessions_result(query, result_totals, result_timeseries) assert actual_result == expected_result + + +@freeze_time("2020-12-18T13:25:15.769Z") +def test_massage_virtual_groupby_timeseries(): + query = _get_query_from_request( + MockRequest( + "statsPeriod=1d&interval=6h&field=sum(session)&field=count_unique(user)&groupBy=session.status" + ) + ) + result_totals = [ + { + "users": 1, + "users_crashed": 1, + "sessions": 6, + "sessions_errored": 1, + "users_errored": 1, + "sessions_abnormal": 0, + "sessions_crashed": 1, + "users_abnormal": 0, + } + ] + # snuba returns the datetimes as strings for now + result_timeseries = [ + { + "sessions_errored": 1, + "users": 1, + "users_crashed": 1, + "sessions_abnormal": 0, + "sessions": 3, + "users_errored": 1, + "users_abnormal": 0, + "sessions_crashed": 1, + "bucketed_started": "2020-12-18T12:00:00+00:00", + }, + { + "sessions_errored": 0, + "users": 1, + "users_crashed": 0, + "sessions_abnormal": 0, + "sessions": 3, + "users_errored": 0, + "users_abnormal": 0, + "sessions_crashed": 0, + "bucketed_started": "2020-12-18T06:00:00+00:00", + }, + ] + + expected_result = { + "intervals": [ + "2020-12-17T12:00:00Z", + 
"2020-12-17T18:00:00Z", + "2020-12-18T00:00:00Z", + "2020-12-18T06:00:00Z", + "2020-12-18T12:00:00Z", + ], + "groups": [ + { + "series": {"count_unique(user)": [0, 0, 0, 1, 0], "sum(session)": [0, 0, 0, 3, 2]}, + "by": {"session.status": "healthy"}, + # while in one of the time slots, we have a healthy user, it is + # the *same* user as the one experiencing a crash later on, + # so in the *whole* time window, that one user is not counted as healthy, + # so the `0` here is expected, as thats an example of the `count_unique` behavior. + "totals": {"count_unique(user)": 0, "sum(session)": 5}, + }, + { + "series": {"count_unique(user)": [0, 0, 0, 0, 1], "sum(session)": [0, 0, 0, 0, 1]}, + "by": {"session.status": "crashed"}, + "totals": {"count_unique(user)": 1, "sum(session)": 1}, + }, + { + "series": {"count_unique(user)": [0, 0, 0, 0, 1], "sum(session)": [0, 0, 0, 0, 1]}, + "by": {"session.status": "errored"}, + "totals": {"count_unique(user)": 1, "sum(session)": 1}, + }, + { + "series": {"count_unique(user)": [0, 0, 0, 0, 0], "sum(session)": [0, 0, 0, 0, 0]}, + "by": {"session.status": "abnormal"}, + "totals": {"count_unique(user)": 0, "sum(session)": 0}, + }, + ], + } + + actual_result = massage_sessions_result(query, result_totals, result_timeseries) + + assert actual_result == expected_result From 31b577471a1dce58ad2b6b111163a7f9cdb81c08 Mon Sep 17 00:00:00 2001 From: Arpad Borsos Date: Fri, 18 Dec 2020 14:53:15 +0100 Subject: [PATCH 04/13] restrict session.duration to healthy sessions only --- src/sentry/snuba/sessions_v2.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/sentry/snuba/sessions_v2.py b/src/sentry/snuba/sessions_v2.py index 4e450c591e19f4..c579610c3c3a6d 100644 --- a/src/sentry/snuba/sessions_v2.py +++ b/src/sentry/snuba/sessions_v2.py @@ -132,7 +132,10 @@ def get_snuba_columns(self, raw_groupby): def extract_from_row(self, row, group): if row is None: return None - return row["duration_avg"] + status = group.get("session.status") + if status is None or status == "healthy": + return row["duration_avg"] + return None class DurationQuantileField(object): @@ -145,7 +148,10 @@ def get_snuba_columns(self, raw_groupby): def extract_from_row(self, row, group): if row is None: return None - return row["duration_quantiles"][self.quantile_index] + status = group.get("session.status") + if status is None or status == "healthy": + return row["duration_quantiles"][self.quantile_index] + return None COLUMN_MAP = { From 128b3a1073cc6c8f0d03e7af21f32ab4216ba2ac Mon Sep 17 00:00:00 2001 From: Arpad Borsos Date: Tue, 5 Jan 2021 11:27:16 +0100 Subject: [PATCH 05/13] ensure stable tests --- .../api/endpoints/organization_sessions.py | 2 - src/sentry/snuba/sessions_v2.py | 2 +- tests/snuba/sessions/test_sessions_v2.py | 46 +++++++++++-------- 3 files changed, 29 insertions(+), 21 deletions(-) diff --git a/src/sentry/api/endpoints/organization_sessions.py b/src/sentry/api/endpoints/organization_sessions.py index 5e8bea0592eae7..3306ec840aa9a7 100644 --- a/src/sentry/api/endpoints/organization_sessions.py +++ b/src/sentry/api/endpoints/organization_sessions.py @@ -20,8 +20,6 @@ def get(self, request, organization): with sentry_sdk.start_span(op="sessions.endpoint", description="run_sessions_query"): result_totals, result_timeseries = run_sessions_query(query) - # print(result_totals, result_timeseries) - with sentry_sdk.start_span(op="sessions.endpoint", description="massage_sessions_result"): result = massage_sessions_result(query, result_totals, 
result_timeseries) return Response(result, status=200) diff --git a/src/sentry/snuba/sessions_v2.py b/src/sentry/snuba/sessions_v2.py index c579610c3c3a6d..92196bb73903f5 100644 --- a/src/sentry/snuba/sessions_v2.py +++ b/src/sentry/snuba/sessions_v2.py @@ -20,7 +20,7 @@ We have basically 3 "metrics" that we can query: - `session` (counter): The number of sessions that occurred -- `user` (set): The set of `dinstinct_id`s. +- `user` (set): The set of `distinct_id`s. - `session.duration` (histogram): The duration of individual sessions (not available for pre-aggregated sessions) diff --git a/tests/snuba/sessions/test_sessions_v2.py b/tests/snuba/sessions/test_sessions_v2.py index 45aa78ed2211bb..00183d53d61ade 100644 --- a/tests/snuba/sessions/test_sessions_v2.py +++ b/tests/snuba/sessions/test_sessions_v2.py @@ -16,6 +16,16 @@ def __init__(self, qs): self.GET = QueryDict(qs) +def result_sorted(result): + """sort the groups of the results array by the `by` object, ensuring a stable order""" + + def stable_dict(d): + return tuple(sorted(d.items(), key=lambda t: t[0])) + + result["groups"].sort(key=lambda group: stable_dict(group["by"])) + return result + + @freeze_time("2020-12-18T11:14:17.105Z") def test_timestamps(): query = _get_query_from_request(MockRequest("statsPeriod=1d&interval=12h")) @@ -88,11 +98,11 @@ def test_massage_simple_timeseries(): "2020-12-18T06:00:00Z", ], "groups": [ - {"series": {"sum(session)": [0, 2, 0, 0, 2]}, "by": {}, "totals": {"sum(session)": 4}} + {"by": {}, "series": {"sum(session)": [0, 2, 0, 0, 2]}, "totals": {"sum(session)": 4}} ], } - actual_result = massage_sessions_result(query, result_totals, result_timeseries) + actual_result = result_sorted(massage_sessions_result(query, result_totals, result_timeseries)) assert actual_result == expected_result @@ -135,19 +145,19 @@ def test_massage_groupby_timeseries(): ], "groups": [ { - "series": {"sum(session)": [0, 2, 0, 0, 2]}, "by": {"release": "test-example-release"}, + "series": {"sum(session)": [0, 2, 0, 0, 2]}, "totals": {"sum(session)": 4}, }, { - "series": {"sum(session)": [0, 0, 0, 0, 1]}, "by": {"release": "test-example-release-2"}, + "series": {"sum(session)": [0, 0, 0, 0, 1]}, "totals": {"sum(session)": 1}, }, ], } - actual_result = massage_sessions_result(query, result_totals, result_timeseries) + actual_result = result_sorted(massage_sessions_result(query, result_totals, result_timeseries)) assert actual_result == expected_result @@ -207,32 +217,32 @@ def test_massage_virtual_groupby_timeseries(): ], "groups": [ { - "series": {"count_unique(user)": [0, 0, 0, 1, 0], "sum(session)": [0, 0, 0, 3, 2]}, - "by": {"session.status": "healthy"}, - # while in one of the time slots, we have a healthy user, it is - # the *same* user as the one experiencing a crash later on, - # so in the *whole* time window, that one user is not counted as healthy, - # so the `0` here is expected, as thats an example of the `count_unique` behavior. 
- "totals": {"count_unique(user)": 0, "sum(session)": 5}, + "by": {"session.status": "abnormal"}, + "series": {"count_unique(user)": [0, 0, 0, 0, 0], "sum(session)": [0, 0, 0, 0, 0]}, + "totals": {"count_unique(user)": 0, "sum(session)": 0}, }, { - "series": {"count_unique(user)": [0, 0, 0, 0, 1], "sum(session)": [0, 0, 0, 0, 1]}, "by": {"session.status": "crashed"}, + "series": {"count_unique(user)": [0, 0, 0, 0, 1], "sum(session)": [0, 0, 0, 0, 1]}, "totals": {"count_unique(user)": 1, "sum(session)": 1}, }, { - "series": {"count_unique(user)": [0, 0, 0, 0, 1], "sum(session)": [0, 0, 0, 0, 1]}, "by": {"session.status": "errored"}, + "series": {"count_unique(user)": [0, 0, 0, 0, 1], "sum(session)": [0, 0, 0, 0, 1]}, "totals": {"count_unique(user)": 1, "sum(session)": 1}, }, { - "series": {"count_unique(user)": [0, 0, 0, 0, 0], "sum(session)": [0, 0, 0, 0, 0]}, - "by": {"session.status": "abnormal"}, - "totals": {"count_unique(user)": 0, "sum(session)": 0}, + "by": {"session.status": "healthy"}, + "series": {"count_unique(user)": [0, 0, 0, 1, 0], "sum(session)": [0, 0, 0, 3, 2]}, + # while in one of the time slots, we have a healthy user, it is + # the *same* user as the one experiencing a crash later on, + # so in the *whole* time window, that one user is not counted as healthy, + # so the `0` here is expected, as thats an example of the `count_unique` behavior. + "totals": {"count_unique(user)": 0, "sum(session)": 5}, }, ], } - actual_result = massage_sessions_result(query, result_totals, result_timeseries) + actual_result = result_sorted(massage_sessions_result(query, result_totals, result_timeseries)) assert actual_result == expected_result From 717a32cad80f2fed46aba4e383ba5268940f4bc1 Mon Sep 17 00:00:00 2001 From: Arpad Borsos Date: Tue, 12 Jan 2021 13:44:38 +0100 Subject: [PATCH 06/13] implement groupBy project --- src/sentry/snuba/sessions_v2.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/sentry/snuba/sessions_v2.py b/src/sentry/snuba/sessions_v2.py index 92196bb73903f5..99f2462e90cbdb 100644 --- a/src/sentry/snuba/sessions_v2.py +++ b/src/sentry/snuba/sessions_v2.py @@ -38,6 +38,7 @@ The Session data can be grouped by a set of tags, which can only appear in the `groupBy` of the query. 
+- `project` - `environment` - `release`: TODO: describe specific release filters such as `release.major`, etc @@ -168,17 +169,18 @@ def extract_from_row(self, row, group): class SimpleGroupBy(object): - def __init__(self, name): - self.name = name + def __init__(self, row_name, name=None): + self.row_name = row_name + self.name = name or row_name def get_snuba_columns(self): - return [self.name] + return [self.row_name] def get_snuba_groupby(self): - return [self.name] + return [self.row_name] def get_keys_for_row(self, row): - return [(self.name, row[self.name])] + return [(self.name, row[self.row_name])] class SessionStatusGroupBy(object): @@ -193,6 +195,7 @@ def get_keys_for_row(self, row): GROUPBY_MAP = { + "project": SimpleGroupBy("project_id", "project"), "environment": SimpleGroupBy("environment"), "release": SimpleGroupBy("release"), "session.status": SessionStatusGroupBy(), From da5c49ba2857bd6ce6f99e5835dd4dd45fffa4a8 Mon Sep 17 00:00:00 2001 From: Arpad Borsos Date: Wed, 13 Jan 2021 13:41:05 +0100 Subject: [PATCH 07/13] add a new util to get a date range with rollup window --- src/sentry/api/utils.py | 43 ++++++++++++++++++++++++++++++++-- tests/sentry/api/test_utils.py | 43 +++++++++++++++++++++++++++++++++- 2 files changed, 83 insertions(+), 3 deletions(-) diff --git a/src/sentry/api/utils.py b/src/sentry/api/utils.py index 3b6e010b390daf..f91f49ce73e435 100644 --- a/src/sentry/api/utils.py +++ b/src/sentry/api/utils.py @@ -2,12 +2,13 @@ from datetime import timedelta +import math import six from django.utils import timezone from sentry.search.utils import parse_datetime_string, InvalidQuery -from sentry.utils.dates import parse_stats_period - +from sentry.utils.dates import parse_stats_period, to_timestamp, to_datetime +from sentry.constants import MAX_ROLLUP_POINTS MAX_STATS_PERIOD = timedelta(days=90) @@ -83,3 +84,41 @@ def get_date_range_from_params(params, optional=False): raise InvalidParams("start must be before end") return start, end + + +def get_date_range_rollup_from_params( + params, + minimum_interval="1h", + default_interval="", + round_range=False, + max_points=MAX_ROLLUP_POINTS, +): + """ + Similar to `get_date_range_from_params`, but this also parses and validates + an `interval`, as `get_rollup_from_request` would do. + + This also optionally rounds the returned range to the given `interval`. + """ + minimum_interval = parse_stats_period(minimum_interval).total_seconds() + interval = parse_stats_period(params.get("interval", default_interval)) + interval = minimum_interval if interval is None else interval.total_seconds() + if interval <= 0: + raise InvalidParams("Interval cannot result in a zero duration.") + + # round the interval up to the minimum + interval = int(minimum_interval * math.ceil(interval / minimum_interval)) + + start, end = get_date_range_from_params(params) + date_range = end - start + if date_range.total_seconds() / interval > max_points: + raise InvalidParams( + "Your interval and date range would create too many results. " + "Use a larger interval, or a smaller date range." 
+ ) + + if round_range: + end_ts = int(interval * math.ceil(to_timestamp(end) / interval)) + end = to_datetime(end_ts) + start = end - date_range + + return start, end, interval diff --git a/tests/sentry/api/test_utils.py b/tests/sentry/api/test_utils.py index 36b384938f0722..9de0e93aeae0f8 100644 --- a/tests/sentry/api/test_utils.py +++ b/tests/sentry/api/test_utils.py @@ -5,7 +5,12 @@ from django.utils import timezone from freezegun import freeze_time -from sentry.api.utils import get_date_range_from_params, InvalidParams, MAX_STATS_PERIOD +from sentry.api.utils import ( + get_date_range_from_params, + get_date_range_rollup_from_params, + InvalidParams, + MAX_STATS_PERIOD, +) from sentry.testutils import TestCase @@ -54,3 +59,39 @@ def test_relative_date_range_incomplete(self): with self.assertRaises(InvalidParams): start, end = get_date_range_from_params({"statsPeriodStart": "14d"}) + + +class GetDateRangeRollupFromParamsTest(TestCase): + def test_intervals(self): + # defaults to 1h + start, end, interval = get_date_range_rollup_from_params({}) + assert interval == 3600 + + # rounds up to a multiple of the minimum + start, end, interval = get_date_range_rollup_from_params( + {"statsPeriod": "14h", "interval": "8m"}, minimum_interval="5m" + ) + assert interval == 600 + + @freeze_time("2018-12-11 03:21:34") + def test_round_range(self): + start, end, interval = get_date_range_rollup_from_params( + {"statsPeriod": "2d"}, round_range=True + ) + assert start == datetime.datetime(2018, 12, 9, 4, tzinfo=timezone.utc) + assert end == datetime.datetime(2018, 12, 11, 4, tzinfo=timezone.utc) + + start, end, interval = get_date_range_rollup_from_params( + {"statsPeriod": "2d", "interval": "1d"}, round_range=True + ) + assert start == datetime.datetime(2018, 12, 10, tzinfo=timezone.utc) + assert end == datetime.datetime(2018, 12, 12, tzinfo=timezone.utc) + + def test_invalid_interval(self): + with self.assertRaises(InvalidParams): + start, end, interval = get_date_range_rollup_from_params({"interval": "0d"}) + with self.assertRaises(InvalidParams): + # default statsPeriod is 90d + start, end, interval = get_date_range_rollup_from_params( + {"interval": "1d"}, max_points=80 + ) From 00d22fa41d97eea22bb4399fa2cf41b37068b191 Mon Sep 17 00:00:00 2001 From: Arpad Borsos Date: Wed, 13 Jan 2021 14:10:12 +0100 Subject: [PATCH 08/13] simplify code dealing with query rollups a bit --- .../api/endpoints/organization_sessions.py | 2 +- src/sentry/api/utils.py | 2 + src/sentry/snuba/sessions_v2.py | 62 +++++-------------- tests/snuba/sessions/test_sessions_v2.py | 55 +++++++--------- 4 files changed, 42 insertions(+), 79 deletions(-) diff --git a/src/sentry/api/endpoints/organization_sessions.py b/src/sentry/api/endpoints/organization_sessions.py index 3306ec840aa9a7..0b4f1255f7e899 100644 --- a/src/sentry/api/endpoints/organization_sessions.py +++ b/src/sentry/api/endpoints/organization_sessions.py @@ -28,4 +28,4 @@ def build_sessions_query(self, request, organization): with sentry_sdk.start_span(op="sessions.endpoint", description="build_sessions_query"): params = self.get_snuba_params(request, organization, check_global_views=False) - return QueryDefinition(request, params) + return QueryDefinition(request.GET, params) diff --git a/src/sentry/api/utils.py b/src/sentry/api/utils.py index f91f49ce73e435..c2b5d24a0e343f 100644 --- a/src/sentry/api/utils.py +++ b/src/sentry/api/utils.py @@ -98,6 +98,8 @@ def get_date_range_rollup_from_params( an `interval`, as `get_rollup_from_request` would do.
This also optionally rounds the returned range to the given `interval`. + The rounding uses integer arithmetic on unix timestamps, so might yield + unexpected results when the interval is > 1d. """ minimum_interval = parse_stats_period(minimum_interval).total_seconds() interval = parse_stats_period(params.get("interval", default_interval)) diff --git a/src/sentry/snuba/sessions_v2.py b/src/sentry/snuba/sessions_v2.py index 99f2462e90cbdb..52c7278154a6cc 100644 --- a/src/sentry/snuba/sessions_v2.py +++ b/src/sentry/snuba/sessions_v2.py @@ -5,10 +5,10 @@ import six import itertools -from sentry.api.event_search import get_filter, InvalidSearchQuery -from sentry.api.utils import get_date_range_from_params -from sentry.utils.dates import get_rollup_from_request -from sentry.utils.snuba import Dataset, raw_query, to_naive_timestamp, naiveify_datetime +from sentry.api.event_search import get_filter +from sentry.api.utils import get_date_range_rollup_from_params +from sentry.utils.dates import to_timestamp +from sentry.utils.snuba import Dataset, raw_query """ The new Sessions API defines a "metrics"-like interface which is can be used in @@ -209,29 +209,22 @@ class QueryDefinition(object): `fields` and `groupby` definitions as [`ColumnDefinition`] objects. """ - def __init__(self, request, params): + def __init__(self, query, params): # self.request = request # self.params = params - self.query = request.GET.get("query") - raw_fields = request.GET.getlist("field", []) - raw_groupby = request.GET.getlist("groupBy", []) + self.query = query.get("query") + raw_fields = query.getlist("field", []) + raw_groupby = query.getlist("groupBy", []) # TODO: maybe show a proper error message for unknown fields/groupby self.fields = {field: COLUMN_MAP.get(field) for field in raw_fields} self.groupby = [GROUPBY_MAP.get(field) for field in raw_groupby] - rollup = get_rollup_from_request( - request, - params, - "1h", - InvalidSearchQuery( - "Your interval and date range would create too many results. " - "Use a larger interval, or a smaller date range." - ), - ) - # The minimum interval is one hour on the server - self.rollup = max(rollup, 3600) + start, end, rollup = get_date_range_rollup_from_params(query, "1h", round_range=True) + self.rollup = rollup + self.start = start + self.end = end query_columns = set() for field in self.fields.values(): @@ -245,37 +238,16 @@ def __init__(self, request, params): query_groupby.update(groupby.get_snuba_groupby()) self.query_groupby = list(query_groupby) - snuba_filter = get_timeseries_snuba_filter( - self.query_columns, self.query, params, self.rollup - ) + params["start"] = start + params["end"] = end + snuba_filter = get_filter(self.query, params) - self.start = snuba_filter.start - self.end = snuba_filter.end self.aggregations = snuba_filter.aggregations self.conditions = snuba_filter.conditions self.filter_keys = snuba_filter.filter_keys -def get_timeseries_snuba_filter(selected_columns, query, params, rollup, default_count=True): - snuba_filter = get_filter(query, params) - if not snuba_filter.start and not snuba_filter.end: - raise InvalidSearchQuery("Cannot get timeseries result without a start and end.") - - return snuba_filter - - -def _get_query_from_request(request): - """ - This creates our [`QueryDefinition`] from the given `request`, which itself - is a bit unfortunate that this depends on HTTP types. 
- """ - start, end = get_date_range_from_params(request.GET) - params = {"start": start, "end": end} - # TODO: refactor all of this so it does not depend on a `request`. - return QueryDefinition(request, params) - - TS_COL = "bucketed_started" @@ -407,8 +379,8 @@ def _get_timestamps(query): The timestamps are returned as ISO strings for now. """ rollup = query.rollup - start = int(to_naive_timestamp(naiveify_datetime(query.start)) / rollup) * rollup - end = (int(to_naive_timestamp(naiveify_datetime(query.end)) / rollup) * rollup) + rollup + start = int(to_timestamp(query.start)) + end = int(to_timestamp(query.end)) return [ datetime.utcfromtimestamp(ts).isoformat() + "Z" for ts in six.moves.xrange(start, end, rollup) diff --git a/tests/snuba/sessions/test_sessions_v2.py b/tests/snuba/sessions/test_sessions_v2.py index 00183d53d61ade..4d640b22923866 100644 --- a/tests/snuba/sessions/test_sessions_v2.py +++ b/tests/snuba/sessions/test_sessions_v2.py @@ -5,15 +5,14 @@ # from sentry.testutils import TestCase from sentry.snuba.sessions_v2 import ( + QueryDefinition, massage_sessions_result, - _get_query_from_request, _get_timestamps, ) -class MockRequest(object): - def __init__(self, qs): - self.GET = QueryDict(qs) +def _make_query(qs): + return QueryDefinition(QueryDict(qs), {}) def result_sorted(result): @@ -28,33 +27,29 @@ def stable_dict(d): @freeze_time("2020-12-18T11:14:17.105Z") def test_timestamps(): - query = _get_query_from_request(MockRequest("statsPeriod=1d&interval=12h")) + query = _make_query("statsPeriod=1d&interval=12h") - expected_timestamps = ["2020-12-17T00:00:00Z", "2020-12-17T12:00:00Z", "2020-12-18T00:00:00Z"] + expected_timestamps = ["2020-12-17T12:00:00Z", "2020-12-18T00:00:00Z"] actual_timestamps = _get_timestamps(query) assert actual_timestamps == expected_timestamps def test_simple_query(): - query = _get_query_from_request(MockRequest("statsPeriod=1d&interval=12h&field=sum(session)")) + query = _make_query("statsPeriod=1d&interval=12h&field=sum(session)") assert query.query_columns == ["sessions"] def test_groupby_query(): - query = _get_query_from_request( - MockRequest("statsPeriod=1d&interval=12h&field=sum(session)&groupBy=release") - ) + query = _make_query("statsPeriod=1d&interval=12h&field=sum(session)&groupBy=release") assert sorted(query.query_columns) == ["release", "sessions"] assert query.query_groupby == ["release"] def test_virtual_groupby_query(): - query = _get_query_from_request( - MockRequest("statsPeriod=1d&interval=12h&field=sum(session)&groupBy=session.status") - ) + query = _make_query("statsPeriod=1d&interval=12h&field=sum(session)&groupBy=session.status") assert sorted(query.query_columns) == [ "sessions", @@ -64,8 +59,8 @@ def test_virtual_groupby_query(): ] assert query.query_groupby == [] - query = _get_query_from_request( - MockRequest("statsPeriod=1d&interval=12h&field=count_unique(user)&groupBy=session.status") + query = _make_query( + "statsPeriod=1d&interval=12h&field=count_unique(user)&groupBy=session.status" ) assert sorted(query.query_columns) == [ @@ -81,7 +76,7 @@ def test_virtual_groupby_query(): def test_massage_simple_timeseries(): """A timeseries is filled up when it only receives partial data""" - query = _get_query_from_request(MockRequest("statsPeriod=1d&interval=6h&field=sum(session)")) + query = _make_query("statsPeriod=1d&interval=6h&field=sum(session)") result_totals = [{"sessions": 4}] # snuba returns the datetimes as strings for now result_timeseries = [ @@ -91,14 +86,13 @@ def test_massage_simple_timeseries(): 
expected_result = { "intervals": [ - "2020-12-17T06:00:00Z", "2020-12-17T12:00:00Z", "2020-12-17T18:00:00Z", "2020-12-18T00:00:00Z", "2020-12-18T06:00:00Z", ], "groups": [ - {"by": {}, "series": {"sum(session)": [0, 2, 0, 0, 2]}, "totals": {"sum(session)": 4}} + {"by": {}, "series": {"sum(session)": [2, 0, 0, 2]}, "totals": {"sum(session)": 4}} ], } @@ -109,9 +103,8 @@ def test_massage_simple_timeseries(): @freeze_time("2020-12-18T11:14:17.105Z") def test_massage_groupby_timeseries(): - query = _get_query_from_request( - MockRequest("statsPeriod=1d&interval=6h&field=sum(session)&groupBy=release") - ) + query = _make_query("statsPeriod=1d&interval=6h&field=sum(session)&groupBy=release") + result_totals = [ {"release": "test-example-release", "sessions": 4}, {"release": "test-example-release-2", "sessions": 1}, @@ -137,7 +130,6 @@ def test_massage_groupby_timeseries(): expected_result = { "intervals": [ - "2020-12-17T06:00:00Z", "2020-12-17T12:00:00Z", "2020-12-17T18:00:00Z", "2020-12-18T00:00:00Z", @@ -146,12 +138,12 @@ def test_massage_groupby_timeseries(): "groups": [ { "by": {"release": "test-example-release"}, - "series": {"sum(session)": [0, 2, 0, 0, 2]}, + "series": {"sum(session)": [2, 0, 0, 2]}, "totals": {"sum(session)": 4}, }, { "by": {"release": "test-example-release-2"}, - "series": {"sum(session)": [0, 0, 0, 0, 1]}, + "series": {"sum(session)": [0, 0, 0, 1]}, "totals": {"sum(session)": 1}, }, ], @@ -164,10 +156,8 @@ def test_massage_groupby_timeseries(): @freeze_time("2020-12-18T13:25:15.769Z") def test_massage_virtual_groupby_timeseries(): - query = _get_query_from_request( - MockRequest( - "statsPeriod=1d&interval=6h&field=sum(session)&field=count_unique(user)&groupBy=session.status" - ) + query = _make_query( + "statsPeriod=1d&interval=6h&field=sum(session)&field=count_unique(user)&groupBy=session.status" ) result_totals = [ { @@ -209,7 +199,6 @@ def test_massage_virtual_groupby_timeseries(): expected_result = { "intervals": [ - "2020-12-17T12:00:00Z", "2020-12-17T18:00:00Z", "2020-12-18T00:00:00Z", "2020-12-18T06:00:00Z", @@ -218,22 +207,22 @@ def test_massage_virtual_groupby_timeseries(): "groups": [ { "by": {"session.status": "abnormal"}, - "series": {"count_unique(user)": [0, 0, 0, 0, 0], "sum(session)": [0, 0, 0, 0, 0]}, + "series": {"count_unique(user)": [0, 0, 0, 0], "sum(session)": [0, 0, 0, 0]}, "totals": {"count_unique(user)": 0, "sum(session)": 0}, }, { "by": {"session.status": "crashed"}, - "series": {"count_unique(user)": [0, 0, 0, 0, 1], "sum(session)": [0, 0, 0, 0, 1]}, + "series": {"count_unique(user)": [0, 0, 0, 1], "sum(session)": [0, 0, 0, 1]}, "totals": {"count_unique(user)": 1, "sum(session)": 1}, }, { "by": {"session.status": "errored"}, - "series": {"count_unique(user)": [0, 0, 0, 0, 1], "sum(session)": [0, 0, 0, 0, 1]}, + "series": {"count_unique(user)": [0, 0, 0, 1], "sum(session)": [0, 0, 0, 1]}, "totals": {"count_unique(user)": 1, "sum(session)": 1}, }, { "by": {"session.status": "healthy"}, - "series": {"count_unique(user)": [0, 0, 0, 1, 0], "sum(session)": [0, 0, 0, 3, 2]}, + "series": {"count_unique(user)": [0, 0, 1, 0], "sum(session)": [0, 0, 3, 2]}, # while in one of the time slots, we have a healthy user, it is # the *same* user as the one experiencing a crash later on, # so in the *whole* time window, that one user is not counted as healthy, From a2a5dea2c5bc4a3e21f71ecb05730ceb6c5f768d Mon Sep 17 00:00:00 2001 From: Arpad Borsos Date: Wed, 13 Jan 2021 15:05:00 +0100 Subject: [PATCH 09/13] simplify query handling a bit --- 
src/sentry/api/endpoints/organization_sessions.py | 10 ++++++---- src/sentry/snuba/sessions_v2.py | 10 ++++------ tests/snuba/sessions/test_sessions_v2.py | 3 +++ 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/src/sentry/api/endpoints/organization_sessions.py b/src/sentry/api/endpoints/organization_sessions.py index 0b4f1255f7e899..5482a5d661954b 100644 --- a/src/sentry/api/endpoints/organization_sessions.py +++ b/src/sentry/api/endpoints/organization_sessions.py @@ -4,7 +4,7 @@ import sentry_sdk -from sentry.api.bases import OrganizationEventsV2EndpointBase +from sentry.api.bases import OrganizationEndpoint from sentry.snuba.sessions_v2 import ( QueryDefinition, run_sessions_query, @@ -12,7 +12,7 @@ ) -class OrganizationSessionsEndpoint(OrganizationEventsV2EndpointBase): +class OrganizationSessionsEndpoint(OrganizationEndpoint): def get(self, request, organization): # with self.handle_query_errors(): query = self.build_sessions_query(request, organization) @@ -26,6 +26,8 @@ def get(self, request, organization): def build_sessions_query(self, request, organization): with sentry_sdk.start_span(op="sessions.endpoint", description="build_sessions_query"): - params = self.get_snuba_params(request, organization, check_global_views=False) + # validate and default all `project` params. + projects = self.get_projects(request, organization) + project_ids = [p.id for p in projects] - return QueryDefinition(request.GET, params) + return QueryDefinition(request.GET, project_ids) diff --git a/src/sentry/snuba/sessions_v2.py b/src/sentry/snuba/sessions_v2.py index 52c7278154a6cc..82673bc99b867f 100644 --- a/src/sentry/snuba/sessions_v2.py +++ b/src/sentry/snuba/sessions_v2.py @@ -209,11 +209,11 @@ class QueryDefinition(object): `fields` and `groupby` definitions as [`ColumnDefinition`] objects. 
""" - def __init__(self, query, params): + def __init__(self, query, project_ids=None): # self.request = request # self.params = params - self.query = query.get("query") + self.query = query.get("query", "") raw_fields = query.getlist("field", []) raw_groupby = query.getlist("groupBy", []) @@ -238,13 +238,11 @@ def __init__(self, query, params): query_groupby.update(groupby.get_snuba_groupby()) self.query_groupby = list(query_groupby) - params["start"] = start - params["end"] = end + params = {"project_id": project_ids or []} snuba_filter = get_filter(self.query, params) self.aggregations = snuba_filter.aggregations self.conditions = snuba_filter.conditions - self.filter_keys = snuba_filter.filter_keys @@ -367,7 +365,7 @@ def make_totals(totals, group): groups.append(group) return { - # "query": query.query, + "query": query.query, "intervals": timestamps, "groups": groups, } diff --git a/tests/snuba/sessions/test_sessions_v2.py b/tests/snuba/sessions/test_sessions_v2.py index 4d640b22923866..ed05a2b59ee375 100644 --- a/tests/snuba/sessions/test_sessions_v2.py +++ b/tests/snuba/sessions/test_sessions_v2.py @@ -85,6 +85,7 @@ def test_massage_simple_timeseries(): ] expected_result = { + "query": "", "intervals": [ "2020-12-17T12:00:00Z", "2020-12-17T18:00:00Z", @@ -129,6 +130,7 @@ def test_massage_groupby_timeseries(): ] expected_result = { + "query": "", "intervals": [ "2020-12-17T12:00:00Z", "2020-12-17T18:00:00Z", @@ -198,6 +200,7 @@ def test_massage_virtual_groupby_timeseries(): ] expected_result = { + "query": "", "intervals": [ "2020-12-17T18:00:00Z", "2020-12-18T00:00:00Z", From b65b134e98f3f3338e70d19056729c743b29bd81 Mon Sep 17 00:00:00 2001 From: Arpad Borsos Date: Thu, 14 Jan 2021 15:34:15 +0100 Subject: [PATCH 10/13] add some endpoint tests --- src/sentry/snuba/sessions_v2.py | 3 - .../endpoints/test_organization_sessions.py | 411 ++++++++++++++++++ 2 files changed, 411 insertions(+), 3 deletions(-) create mode 100644 tests/snuba/api/endpoints/test_organization_sessions.py diff --git a/src/sentry/snuba/sessions_v2.py b/src/sentry/snuba/sessions_v2.py index 82673bc99b867f..3b485e34d8e5df 100644 --- a/src/sentry/snuba/sessions_v2.py +++ b/src/sentry/snuba/sessions_v2.py @@ -210,9 +210,6 @@ class QueryDefinition(object): """ def __init__(self, query, project_ids=None): - # self.request = request - # self.params = params - self.query = query.get("query", "") raw_fields = query.getlist("field", []) raw_groupby = query.getlist("groupBy", []) diff --git a/tests/snuba/api/endpoints/test_organization_sessions.py b/tests/snuba/api/endpoints/test_organization_sessions.py new file mode 100644 index 00000000000000..e59ebb0ff8129d --- /dev/null +++ b/tests/snuba/api/endpoints/test_organization_sessions.py @@ -0,0 +1,411 @@ +from __future__ import absolute_import + +import datetime +import pytz + +from uuid import uuid4 +from freezegun import freeze_time + +from django.core.urlresolvers import reverse + +from sentry.testutils import APITestCase, SnubaTestCase +from sentry.utils.dates import to_timestamp + + +def result_sorted(result): + """sort the groups of the results array by the `by` object, ensuring a stable order""" + + def stable_dict(d): + return tuple(sorted(d.items(), key=lambda t: t[0])) + + result["groups"].sort(key=lambda group: stable_dict(group["by"])) + return result + + +class OrganizationSessionsEndpointTest(APITestCase, SnubaTestCase): + def setUp(self): + super(OrganizationSessionsEndpointTest, self).setUp() + self.setup_fixture() + + def setup_fixture(self): + 
self.timestamp = to_timestamp(datetime.datetime(2021, 1, 14, 12, 27, 28, tzinfo=pytz.utc)) + self.received = self.timestamp + self.session_started = self.timestamp // 60 * 60 + + self.organization1 = self.organization + self.organization2 = self.create_organization() + self.project1 = self.project + self.project2 = self.create_project() + self.project3 = self.create_project() + self.project4 = self.create_project(organization=self.organization2) + + template = { + "distinct_id": "00000000-0000-0000-0000-000000000000", + "status": "exited", + "seq": 0, + "release": "foo@1.0.0", + "environment": "production", + "retention_days": 90, + "duration": None, + "errors": 0, + "started": self.session_started, + "received": self.received, + } + + def make_session(project, **kwargs): + return dict( + template, + session_id=uuid4().hex, + org_id=project.organization_id, + project_id=project.id, + **kwargs + ) + + self.store_session(make_session(self.project1)) + self.store_session(make_session(self.project1, release="foo@1.1.0")) + self.store_session(make_session(self.project1, started=self.session_started - 60 * 60)) + self.store_session(make_session(self.project1, started=self.session_started - 12 * 60 * 60)) + self.store_session(make_session(self.project2, status="crashed")) + self.store_session(make_session(self.project2, environment="development")) + self.store_session(make_session(self.project3, errors=1)) + self.store_session( + make_session( + self.project3, + distinct_id="39887d89-13b2-4c84-8c23-5d13d2102664", + started=self.session_started - 60 * 60, + ) + ) + self.store_session( + make_session( + self.project3, distinct_id="39887d89-13b2-4c84-8c23-5d13d2102664", errors=1 + ) + ) + self.store_session(make_session(self.project4)) + + def do_request(self, query): + self.login_as(user=self.user) + url = reverse( + "sentry-api-0-organization-sessions", + kwargs={"organization_slug": self.organization.slug}, + ) + return self.client.get(url, query, format="json") + + def test_empty_request(self): + response = self.do_request({}) + + # TODO: making an empty request throws currently + assert response.status_code == 500, response.content + assert response.data["detail"] == "Internal Error" + + def test_inaccessible_project(self): + response = self.do_request({"project": [self.project4.id]}) + + assert response.status_code == 403, response.content + assert response.data == {"detail": "You do not have permission to perform this action."} + + def test_unknown_field(self): + response = self.do_request({"field": ["summ(sessin)"]}) + + # TODO: throws currently + assert response.status_code == 500, response.content + assert response.data == {"detail": "Internal Error", "errorId": None} + + def test_unknown_groupby(self): + response = self.do_request({"field": ["sum(session)"], "groupBy": ["envriomnent"]}) + + # TODO: throws currently + assert response.status_code == 500, response.content + assert response.data == {"detail": "Internal Error", "errorId": None} + + def test_too_many_points(self): + # TODO: looks like this is well within the range of valid points + return + # default statsPeriod is 90d + response = self.do_request({"field": ["sum(session)"], "interval": "1h"}) + + assert response.status_code == 500, response.content + assert response.data == {} + + @freeze_time("2021-01-14T12:27:28.303Z") + def test_timeseries_interval(self): + response = self.do_request( + {"statsPeriod": "1d", "interval": "1d", "field": ["sum(session)"]} + ) + + assert response.status_code == 200, response.content + assert 
result_sorted(response.data) == { + "query": "", + "intervals": ["2021-01-14T00:00:00Z"], + "groups": [{"by": {}, "series": {"sum(session)": [9]}, "totals": {"sum(session)": 9}}], + } + + response = self.do_request( + {"statsPeriod": "1d", "interval": "6h", "field": ["sum(session)"]} + ) + + assert response.status_code == 200, response.content + assert result_sorted(response.data) == { + "query": "", + "intervals": [ + "2021-01-13T18:00:00Z", + "2021-01-14T00:00:00Z", + "2021-01-14T06:00:00Z", + "2021-01-14T12:00:00Z", + ], + "groups": [ + {"by": {}, "series": {"sum(session)": [0, 1, 2, 6]}, "totals": {"sum(session)": 9}} + ], + } + + @freeze_time("2021-01-14T12:27:28.303Z") + def test_minimum_interval(self): + # smallest interval is 1h + response = self.do_request( + {"statsPeriod": "2h", "interval": "5m", "field": ["sum(session)"]} + ) + + assert response.status_code == 200, response.content + assert result_sorted(response.data) == { + "query": "", + "intervals": ["2021-01-14T11:00:00Z", "2021-01-14T12:00:00Z"], + "groups": [ + {"by": {}, "series": {"sum(session)": [2, 6]}, "totals": {"sum(session)": 8}} + ], + } + + @freeze_time("2021-01-14T12:27:28.303Z") + def test_filter_projects(self): + response = self.do_request( + { + "statsPeriod": "1d", + "interval": "1d", + "field": ["sum(session)"], + "project": [self.project2.id, self.project3.id], + } + ) + + assert response.status_code == 200, response.content + assert result_sorted(response.data)["groups"] == [ + {"by": {}, "series": {"sum(session)": [5]}, "totals": {"sum(session)": 5}} + ] + + @freeze_time("2021-01-14T12:27:28.303Z") + def test_filter_environment(self): + response = self.do_request( + { + "statsPeriod": "1d", + "interval": "1d", + "field": ["sum(session)"], + "query": "environment:development", + } + ) + + assert response.status_code == 200, response.content + assert result_sorted(response.data)["groups"] == [ + {"by": {}, "series": {"sum(session)": [1]}, "totals": {"sum(session)": 1}} + ] + + @freeze_time("2021-01-14T12:27:28.303Z") + def test_filter_release(self): + response = self.do_request( + { + "statsPeriod": "1d", + "interval": "1d", + "field": ["sum(session)"], + "query": "release:foo@1.1.0", + } + ) + + assert response.status_code == 200, response.content + assert result_sorted(response.data)["groups"] == [ + {"by": {}, "series": {"sum(session)": [1]}, "totals": {"sum(session)": 1}} + ] + + @freeze_time("2021-01-14T12:27:28.303Z") + def test_groupby_project(self): + response = self.do_request( + { + "statsPeriod": "1d", + "interval": "1d", + "field": ["sum(session)"], + "groupBy": ["project"], + } + ) + + assert response.status_code == 200, response.content + assert result_sorted(response.data)["groups"] == [ + { + "by": {"project": self.project1.id}, + "series": {"sum(session)": [4]}, + "totals": {"sum(session)": 4}, + }, + { + "by": {"project": self.project2.id}, + "series": {"sum(session)": [2]}, + "totals": {"sum(session)": 2}, + }, + { + "by": {"project": self.project3.id}, + "series": {"sum(session)": [3]}, + "totals": {"sum(session)": 3}, + }, + ] + + @freeze_time("2021-01-14T12:27:28.303Z") + def test_groupby_environment(self): + response = self.do_request( + { + "statsPeriod": "1d", + "interval": "1d", + "field": ["sum(session)"], + "groupBy": ["environment"], + } + ) + + assert response.status_code == 200, response.content + assert result_sorted(response.data)["groups"] == [ + { + "by": {"environment": "development"}, + "series": {"sum(session)": [1]}, + "totals": {"sum(session)": 1}, + }, + { + "by": 
{"environment": "production"}, + "series": {"sum(session)": [8]}, + "totals": {"sum(session)": 8}, + }, + ] + + @freeze_time("2021-01-14T12:27:28.303Z") + def test_groupby_release(self): + response = self.do_request( + { + "statsPeriod": "1d", + "interval": "1d", + "field": ["sum(session)"], + "groupBy": ["release"], + } + ) + + assert response.status_code == 200, response.content + assert result_sorted(response.data)["groups"] == [ + { + "by": {"release": "foo@1.0.0"}, + "series": {"sum(session)": [8]}, + "totals": {"sum(session)": 8}, + }, + { + "by": {"release": "foo@1.1.0"}, + "series": {"sum(session)": [1]}, + "totals": {"sum(session)": 1}, + }, + ] + + @freeze_time("2021-01-14T12:27:28.303Z") + def test_groupby_status(self): + response = self.do_request( + { + "statsPeriod": "1d", + "interval": "1d", + "field": ["sum(session)"], + "groupBy": ["session.status"], + } + ) + + assert response.status_code == 200, response.content + assert result_sorted(response.data)["groups"] == [ + { + "by": {"session.status": "abnormal"}, + "series": {"sum(session)": [0]}, + "totals": {"sum(session)": 0}, + }, + { + "by": {"session.status": "crashed"}, + "series": {"sum(session)": [1]}, + "totals": {"sum(session)": 1}, + }, + { + "by": {"session.status": "errored"}, + "series": {"sum(session)": [3]}, + "totals": {"sum(session)": 3}, + }, + { + "by": {"session.status": "healthy"}, + "series": {"sum(session)": [6]}, + "totals": {"sum(session)": 6}, + }, + ] + + @freeze_time("2021-01-14T12:27:28.303Z") + def test_groupby_cross(self): + response = self.do_request( + { + "statsPeriod": "1d", + "interval": "1d", + "field": ["sum(session)"], + "groupBy": ["release", "environment"], + } + ) + + assert response.status_code == 200, response.content + assert result_sorted(response.data)["groups"] == [ + { + "by": {"environment": "development", "release": "foo@1.0.0"}, + "series": {"sum(session)": [1]}, + "totals": {"sum(session)": 1}, + }, + { + "by": {"environment": "production", "release": "foo@1.0.0"}, + "series": {"sum(session)": [7]}, + "totals": {"sum(session)": 7}, + }, + { + "by": {"environment": "production", "release": "foo@1.1.0"}, + "series": {"sum(session)": [1]}, + "totals": {"sum(session)": 1}, + }, + ] + + @freeze_time("2021-01-14T12:27:28.303Z") + def test_users_groupby(self): + response = self.do_request( + {"statsPeriod": "1d", "interval": "1d", "field": ["count_unique(user)"]} + ) + + assert response.status_code == 200, response.content + assert result_sorted(response.data)["groups"] == [ + {"by": {}, "series": {"count_unique(user)": [1]}, "totals": {"count_unique(user)": 1}} + ] + + response = self.do_request( + { + "statsPeriod": "1d", + "interval": "1d", + "field": ["count_unique(user)"], + "groupBy": ["session.status"], + } + ) + + assert response.status_code == 200, response.content + assert result_sorted(response.data)["groups"] == [ + { + "by": {"session.status": "abnormal"}, + "series": {"count_unique(user)": [0]}, + "totals": {"count_unique(user)": 0}, + }, + { + "by": {"session.status": "crashed"}, + "series": {"count_unique(user)": [0]}, + "totals": {"count_unique(user)": 0}, + }, + { + "by": {"session.status": "errored"}, + "series": {"count_unique(user)": [1]}, + "totals": {"count_unique(user)": 1}, + }, + { + "by": {"session.status": "healthy"}, + "series": {"count_unique(user)": [0]}, + "totals": {"count_unique(user)": 0}, + }, + ] From 2504a5186f1b397a51039eb22fa3b0a86103260c Mon Sep 17 00:00:00 2001 From: Arpad Borsos Date: Fri, 15 Jan 2021 12:45:47 +0100 Subject: [PATCH 
11/13] implement proper error handling for field mismatches --- .../api/endpoints/organization_sessions.py | 47 +++++++++++++------ src/sentry/snuba/sessions_v2.py | 21 +++++++-- .../endpoints/test_organization_sessions.py | 17 +++---- 3 files changed, 57 insertions(+), 28 deletions(-) diff --git a/src/sentry/api/endpoints/organization_sessions.py b/src/sentry/api/endpoints/organization_sessions.py index 5482a5d661954b..4d685a758d2326 100644 --- a/src/sentry/api/endpoints/organization_sessions.py +++ b/src/sentry/api/endpoints/organization_sessions.py @@ -1,33 +1,50 @@ from __future__ import absolute_import +from contextlib import contextmanager + from rest_framework.response import Response +from rest_framework.exceptions import ParseError +import six import sentry_sdk -from sentry.api.bases import OrganizationEndpoint +from sentry.api.bases import OrganizationEventsEndpointBase from sentry.snuba.sessions_v2 import ( + InvalidField, QueryDefinition, run_sessions_query, massage_sessions_result, ) -class OrganizationSessionsEndpoint(OrganizationEndpoint): +# NOTE: this currently extends `OrganizationEventsEndpointBase` for `handle_query_errors` only, which should ideally be decoupled from the base class. +class OrganizationSessionsEndpoint(OrganizationEventsEndpointBase): def get(self, request, organization): - # with self.handle_query_errors(): - query = self.build_sessions_query(request, organization) + with self.handle_query_errors(): + with sentry_sdk.start_span(op="sessions.endpoint", description="build_sessions_query"): + query = self.build_sessions_query(request, organization) - with sentry_sdk.start_span(op="sessions.endpoint", description="run_sessions_query"): - result_totals, result_timeseries = run_sessions_query(query) + with sentry_sdk.start_span(op="sessions.endpoint", description="run_sessions_query"): + result_totals, result_timeseries = run_sessions_query(query) - with sentry_sdk.start_span(op="sessions.endpoint", description="massage_sessions_result"): - result = massage_sessions_result(query, result_totals, result_timeseries) - return Response(result, status=200) + with sentry_sdk.start_span( + op="sessions.endpoint", description="massage_sessions_result" + ): + result = massage_sessions_result(query, result_totals, result_timeseries) + return Response(result, status=200) def build_sessions_query(self, request, organization): - with sentry_sdk.start_span(op="sessions.endpoint", description="build_sessions_query"): - # validate and default all `project` params. - projects = self.get_projects(request, organization) - project_ids = [p.id for p in projects] - - return QueryDefinition(request.GET, project_ids) + # validate and default all `project` params. + projects = self.get_projects(request, organization) + project_ids = [p.id for p in projects] + + return QueryDefinition(request.GET, project_ids) + + @contextmanager + def handle_query_errors(self): + try: + # TODO: this context manager should be decoupled from `OrganizationEventsEndpointBase`? + with super().handle_query_errors(): + yield + except InvalidField as error: + raise ParseError(detail=six.text_type(error)) diff --git a/src/sentry/snuba/sessions_v2.py b/src/sentry/snuba/sessions_v2.py index 3b485e34d8e5df..17dafda0aa0236 100644 --- a/src/sentry/snuba/sessions_v2.py +++ b/src/sentry/snuba/sessions_v2.py @@ -202,6 +202,10 @@ def get_keys_for_row(self, row): } +class InvalidField(Exception): + pass + + class QueryDefinition(object): """ This is the definition of the query the user wants to execute. 
@@ -214,9 +218,20 @@ def __init__(self, query, project_ids=None): raw_fields = query.getlist("field", []) raw_groupby = query.getlist("groupBy", []) - # TODO: maybe show a proper error message for unknown fields/groupby - self.fields = {field: COLUMN_MAP.get(field) for field in raw_fields} - self.groupby = [GROUPBY_MAP.get(field) for field in raw_groupby] + if len(raw_fields) == 0: + raise InvalidField(u'Request is missing a "field"') + + self.fields = {} + for key in raw_fields: + if key not in COLUMN_MAP: + raise InvalidField(u'Invalid field: "{}"'.format(key)) + self.fields[key] = COLUMN_MAP[key] + + self.groupby = [] + for key in raw_groupby: + if key not in GROUPBY_MAP: + raise InvalidField(u'Invalid groupBy: "{}"'.format(key)) + self.groupby.append(GROUPBY_MAP[key]) start, end, rollup = get_date_range_rollup_from_params(query, "1h", round_range=True) self.rollup = rollup diff --git a/tests/snuba/api/endpoints/test_organization_sessions.py b/tests/snuba/api/endpoints/test_organization_sessions.py index e59ebb0ff8129d..f3b12d22bb1d76 100644 --- a/tests/snuba/api/endpoints/test_organization_sessions.py +++ b/tests/snuba/api/endpoints/test_organization_sessions.py @@ -93,9 +93,8 @@ def do_request(self, query): def test_empty_request(self): response = self.do_request({}) - # TODO: making an empty request throws currently - assert response.status_code == 500, response.content - assert response.data["detail"] == "Internal Error" + assert response.status_code == 400, response.content + assert response.data == {"detail": 'Request is missing a "field"'} def test_inaccessible_project(self): response = self.do_request({"project": [self.project4.id]}) @@ -106,16 +105,14 @@ def test_inaccessible_project(self): def test_unknown_field(self): response = self.do_request({"field": ["summ(sessin)"]}) - # TODO: throws currently - assert response.status_code == 500, response.content - assert response.data == {"detail": "Internal Error", "errorId": None} + assert response.status_code == 400, response.content + assert response.data == {"detail": 'Invalid field: "summ(sessin)"'} def test_unknown_groupby(self): response = self.do_request({"field": ["sum(session)"], "groupBy": ["envriomnent"]}) - # TODO: throws currently - assert response.status_code == 500, response.content - assert response.data == {"detail": "Internal Error", "errorId": None} + assert response.status_code == 400, response.content + assert response.data == {"detail": 'Invalid groupBy: "envriomnent"'} def test_too_many_points(self): # TODO: looks like this is well within the range of valid points @@ -123,7 +120,7 @@ def test_too_many_points(self): # default statsPeriod is 90d response = self.do_request({"field": ["sum(session)"], "interval": "1h"}) - assert response.status_code == 500, response.content + assert response.status_code == 400, response.content assert response.data == {} @freeze_time("2021-01-14T12:27:28.303Z") From b8914a85c843a3876ad135b9b1d6cda5e5cb1d1f Mon Sep 17 00:00:00 2001 From: Arpad Borsos Date: Fri, 15 Jan 2021 13:50:46 +0100 Subject: [PATCH 12/13] add missing field to test --- tests/snuba/sessions/test_sessions_v2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/snuba/sessions/test_sessions_v2.py b/tests/snuba/sessions/test_sessions_v2.py index ed05a2b59ee375..995a7fe6f9c9d1 100644 --- a/tests/snuba/sessions/test_sessions_v2.py +++ b/tests/snuba/sessions/test_sessions_v2.py @@ -27,7 +27,7 @@ def stable_dict(d): @freeze_time("2020-12-18T11:14:17.105Z") def test_timestamps(): - query = 
_make_query("statsPeriod=1d&interval=12h") + query = _make_query("statsPeriod=1d&interval=12h&field=sum(session)") expected_timestamps = ["2020-12-17T12:00:00Z", "2020-12-18T00:00:00Z"] actual_timestamps = _get_timestamps(query) From 4ce96570c588a6b307ef6171cbc2bcb8beaff297 Mon Sep 17 00:00:00 2001 From: Arpad Borsos Date: Fri, 15 Jan 2021 14:08:11 +0100 Subject: [PATCH 13/13] use a py2.7-compatible super() call --- src/sentry/api/endpoints/organization_sessions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sentry/api/endpoints/organization_sessions.py b/src/sentry/api/endpoints/organization_sessions.py index 4d685a758d2326..640eea68951517 100644 --- a/src/sentry/api/endpoints/organization_sessions.py +++ b/src/sentry/api/endpoints/organization_sessions.py @@ -44,7 +44,7 @@ def build_sessions_query(self, request, organization): def handle_query_errors(self): try: # TODO: this context manager should be decoupled from `OrganizationEventsEndpointBase`? - with super().handle_query_errors(): + with super(OrganizationSessionsEndpoint, self).handle_query_errors(): yield except InvalidField as error: raise ParseError(detail=six.text_type(error))
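
As a usage illustration (not part of the patch series above): querying the finished endpoint looks like what OrganizationSessionsEndpointTest.do_request does in the tests. The concrete URL pattern is registered in src/sentry/api/urls.py and is not shown in this excerpt, so the sketch below resolves the route by name; the fetch_sessions helper is hypothetical and assumes a DRF-style test client such as self.client in APITestCase.

    from django.core.urlresolvers import reverse

    def fetch_sessions(client, organization_slug):
        # Resolve the endpoint added by this patch series via its route name;
        # the actual path lives in src/sentry/api/urls.py.
        url = reverse(
            "sentry-api-0-organization-sessions",
            kwargs={"organization_slug": organization_slug},
        )
        # `field` and `groupBy` may be passed multiple times, exactly as in the
        # endpoint tests above.
        return client.get(
            url,
            {
                "statsPeriod": "1d",
                "interval": "6h",
                "field": ["sum(session)", "count_unique(user)"],
                "groupBy": ["session.status"],
            },
            format="json",
        )

The response body then has the shape produced by massage_sessions_result: a "query" echo, a list of ISO-8601 "intervals", and "groups", where each group carries "by", "series", and "totals" keys.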