From 2b0199b52e345ffc62137e619e05a2a7692bb1c1 Mon Sep 17 00:00:00 2001 From: Cyril Tovena Date: Tue, 21 May 2019 17:35:46 -0400 Subject: [PATCH 1/9] LogQL: Vector and Range Vector Aggregation. - adds avg,min,max,top,bottomk,stddev,stdvar,count - updates api documentation - adds tests Improve yacc & go lexer to understand duration Remove support for regexp in all queries Clean up querier and logselector --- cmd/logcli/client.go | 16 +- cmd/logcli/main.go | 1 - docs/api.md | 176 +++++++++++ docs/usage.md | 61 +++- pkg/ingester/ingester.go | 2 +- pkg/ingester/ingester_test.go | 24 +- pkg/ingester/instance.go | 42 ++- pkg/ingester/tailer.go | 49 ++- pkg/iter/iterator.go | 90 ++++++ pkg/iter/iterator_test.go | 65 +++- pkg/logproto/logproto.pb.go | 275 +++++------------ pkg/logproto/logproto.proto | 10 +- pkg/logql/ast.go | 224 ++++++++++++-- pkg/logql/ast_test.go | 28 ++ pkg/logql/engine.go | 502 +++++++++++++++++++++++++++++++ pkg/logql/expr.y | 111 +++++-- pkg/logql/expr.y.go | 417 ++++++++++++++++++++----- pkg/logql/parser.go | 99 ++++-- pkg/logql/parser_test.go | 293 ++++++++++++++++++ pkg/logql/range_vector.go | 129 ++++++++ pkg/logql/range_vector_test.go | 138 +++++++++ pkg/logql/vector.go | 65 ++++ pkg/loki/modules.go | 4 +- pkg/querier/http.go | 139 ++++++++- pkg/querier/querier.go | 75 ++--- pkg/querier/querier_mock_test.go | 3 +- pkg/querier/querier_test.go | 14 +- pkg/storage/store.go | 60 ++-- pkg/storage/store_test.go | 18 +- pkg/storage/util_test.go | 6 +- pkg/util/conv.go | 14 +- 31 files changed, 2624 insertions(+), 526 deletions(-) create mode 100644 pkg/logql/ast_test.go create mode 100644 pkg/logql/engine.go create mode 100644 pkg/logql/range_vector.go create mode 100644 pkg/logql/range_vector_test.go create mode 100644 pkg/logql/vector.go diff --git a/cmd/logcli/client.go b/cmd/logcli/client.go index 0984ba453899..777b54498b12 100644 --- a/cmd/logcli/client.go +++ b/cmd/logcli/client.go @@ -18,20 +18,19 @@ import ( ) const ( - queryPath = 
"/api/prom/query?query=%s&limit=%d&start=%d&end=%d&direction=%s&regexp=%s" + queryPath = "/api/prom/query?query=%s&limit=%d&start=%d&end=%d&direction=%s" labelsPath = "/api/prom/label" labelValuesPath = "/api/prom/label/%s/values" - tailPath = "/api/prom/tail?query=%s&regexp=%s&delay_for=%d&limit=%d&start=%d" + tailPath = "/api/prom/tail?query=%s&delay_for=%d&limit=%d&start=%d" ) func query(from, through time.Time, direction logproto.Direction) (*logproto.QueryResponse, error) { path := fmt.Sprintf(queryPath, - url.QueryEscape(*queryStr), // query - *limit, // limit - from.UnixNano(), // start - through.UnixNano(), // end - direction.String(), // direction - url.QueryEscape(*regexpStr), // regexp + url.QueryEscape(*queryStr), // query + *limit, // limit + from.UnixNano(), // start + through.UnixNano(), // end + direction.String(), // direction ) var resp logproto.QueryResponse @@ -113,7 +112,6 @@ func doRequest(path string, out interface{}) error { func liveTailQueryConn() (*websocket.Conn, error) { path := fmt.Sprintf(tailPath, url.QueryEscape(*queryStr), // query - url.QueryEscape(*regexpStr), // regexp *delayFor, // delay_for *limit, // limit getStart(time.Now()).UnixNano(), // start diff --git a/cmd/logcli/main.go b/cmd/logcli/main.go index 7ab7c443e55e..6c2b48fbf280 100644 --- a/cmd/logcli/main.go +++ b/cmd/logcli/main.go @@ -27,7 +27,6 @@ var ( queryCmd = app.Command("query", "Run a LogQL query.") queryStr = queryCmd.Arg("query", "eg '{foo=\"bar\",baz=\"blip\"}'").Required().String() - regexpStr = queryCmd.Arg("regex", "").String() limit = queryCmd.Flag("limit", "Limit on number of entries to print.").Default("30").Int() since = queryCmd.Flag("since", "Lookback window.").Default("1h").Duration() from = queryCmd.Flag("from", "Start looking for logs at this absolute time (inclusive)").String() diff --git a/docs/api.md b/docs/api.md index 2432f307dfe0..3f3754f2f089 100644 --- a/docs/api.md +++ b/docs/api.md @@ -20,6 +20,180 @@ The Loki server has the following API 
endpoints (_Note:_ Authentication is out o } ] } + + ``` + +- `GET /api/v1/query` + + For doing instant queries at a single point in time, accepts the following parameters in the query-string: + + - `query`: a logQL query + - `limit`: max number of entries to return (not used for sample expression) + - `time`: the evaluation time for the query, as a nanosecond Unix epoch (nanoseconds since 1970). Default is always now. + - `direction`: `forward` or `backward`, useful when specifying a limit. Default is backward. + + Loki needs to query the index store in order to find log streams for particular labels and the store is spread out by time, + so you need to specify the time and labels accordingly. Querying a long time into the history will cause additional + load to the index server and make the query slower. + + Responses looks like this: + + ```json + { + "resultType": "vector" | "streams", + "result": + } + ``` + + Examples: + + ```bash + $ curl -G -s "http://localhost:3100/api/v1/query" --data-urlencode 'query=sum(rate({job="varlogs"}[10m])) by (level)' | jq + { + "resultType": "vector", + "result": [ + { + "metric": {}, + "value": [ + 1559848867745737, + "1267.1266666666666" + ] + }, + { + "metric": { + "level": "warn" + }, + "value": [ + 1559848867745737, + "37.77166666666667" + ] + }, + { + "metric": { + "level": "info" + }, + "value": [ + 1559848867745737, + "37.69" + ] + } + ] + } + ``` + + ```bash + curl -G -s "http://localhost:3100/api/v1/query" --data-urlencode 'query={job="varlogs"}' | jq + { + "resultType": "streams", + "result": [ + { + "labels": "{filename=\"/var/log/myproject.log\", job=\"varlogs\", level=\"info\"}", + "entries": [ + { + "ts": "2019-06-06T19:25:41.972739Z", + "line": "foo" + }, + { + "ts": "2019-06-06T19:25:41.972722Z", + "line": "bar" + } + ] + } + ] + ``` + +- `GET /api/v1/query_range` + + For doing queries over a range of time, accepts the following parameters in the query-string: + + - `query`: a logQL query + - `limit`: max 
number of entries to return (not used for sample expression) + - `start`: the start time for the query, as a nanosecond Unix epoch (nanoseconds since 1970). Default is always one hour ago. + - `end`: the end time for the query, as a nanosecond Unix epoch (nanoseconds since 1970). Default is always now. + - `step`: query resolution step width in seconds. Default 1 second. + - `direction`: `forward` or `backward`, useful when specifying a limit. Default is backward. + + Loki needs to query the index store in order to find log streams for particular labels and the store is spread out by time, + so you need to specify the time and labels accordingly. Querying a long time into the history will cause additional + load to the index server and make the query slower. + + Responses looks like this: + + ```json + { + "resultType": "matrix" | "streams", + "result": + } + ``` + + Examples: + + ```bash + $ curl -G -s "http://localhost:3100/api/v1/query_range" --data-urlencode 'query=sum(rate({job="varlogs"}[10m])) by (level)' --data-urlencode 'step=300' | jq + { + "resultType": "matrix", + "result": [ + { + "metric": { + "level": "info" + }, + "values": [ + [ + 1559848958663735, + "137.95" + ], + [ + 1559849258663735, + "467.115" + ], + [ + 1559849558663735, + "658.8516666666667" + ] + ] + }, + { + "metric": { + "level": "warn" + }, + "values": [ + [ + 1559848958663735, + "137.27833333333334" + ], + [ + 1559849258663735, + "467.69" + ], + [ + 1559849558663735, + "660.6933333333334" + ] + ] + } + ] + } + ``` + + ```bash + curl -G -s "http://localhost:3100/api/v1/query_range" --data-urlencode 'query={job="varlogs"}' | jq + { + "resultType": "streams", + "result": [ + { + "labels": "{filename=\"/var/log/myproject.log\", job=\"varlogs\", level=\"info\"}", + "entries": [ + { + "ts": "2019-06-06T19:25:41.972739Z", + "line": "foo" + }, + { + "ts": "2019-06-06T19:25:41.972722Z", + "line": "bar" + } + ] + } + ] ``` - `GET /api/prom/query` @@ -37,6 +211,8 @@ The Loki server has the 
following API endpoints (_Note:_ Authentication is out o so you need to specify the start and end labels accordingly. Querying a long time into the history will cause additional load to the index server and make the query slower. + > This endpoint doesn't accept [sample queries](./usage.md#counting-logs). + Responses looks like this: ```json diff --git a/docs/usage.md b/docs/usage.md index 821e10c9c9f9..44d97dcb6813 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -21,7 +21,7 @@ Read more about the Explore feature in the [Grafana docs](http://docs.grafana.or ## Searching with Labels and Distributed Grep -A log query consists of two parts: **log stream selector**, and a **filter expression**. For performance reasons you need to start by choosing a set of log streams using a Prometheus-style log stream selector. +A log filter query consists of two parts: a **log stream selector** and a **filter expression**. For performance reasons you need to start by choosing a set of log streams using a Prometheus-style log stream selector. The log stream selector will reduce the number of log streams to a manageable volume and then the regex search expression is used to do a distributed grep over those log streams. @@ -76,3 +76,62 @@ The query language is still under development to support more features, e.g.,: - Number extraction for timeseries based on number in log messages - JSON accessors for filtering of JSON-structured logs - Context (like `grep -C n`) + +## Counting logs + +Loki's LogQL supports sample expressions, which let you count entries per stream after the regex filtering stage. + +### Range Vector aggregation + +The language shares the same [range vector](https://prometheus.io/docs/prometheus/latest/querying/basics/#range-vector-selectors) concept from Prometheus, except that the selected range of samples contains a value of one for each log entry. You can then apply an aggregation over the selected range to transform it into an instant vector. 
+ +`rate` calculates the number of entries per second and `count_over_time` counts the entries for each log stream within the range. + +In this example, we count all the log lines we have recorded within the last 5min for the mysql job. + +> `count_over_time({job="mysql"}[5m])` + +A range vector aggregation can also be applied to a [Filter Expression](#filter-expression), allowing you to select only matching log entries. + +> `rate( ( {job="mysql"} |= "error" != "timeout" )[10s] )` + +The query above will compute the per-second rate of all errors except those containing `timeout` within the last 10 seconds. + +You can then use aggregation operators over the range vector aggregation. + +### Aggregation operators + +Like [PromQL](https://prometheus.io/docs/prometheus/latest/querying/operators/#aggregation-operators), Loki's LogQL supports a subset of built-in aggregation operators that can be used to aggregate the elements of a single vector, resulting in a new vector of fewer elements with aggregated values: + +- `sum` (calculate sum over dimensions) +- `min` (select minimum over dimensions) +- `max` (select maximum over dimensions) +- `avg` (calculate the average over dimensions) +- `stddev` (calculate population standard deviation over dimensions) +- `stdvar` (calculate population standard variance over dimensions) +- `count` (count number of elements in the vector) +- `bottomk` (smallest k elements by sample value) +- `topk` (largest k elements by sample value) + +These operators can either be used to aggregate over all label dimensions or preserve distinct dimensions by including a `without` or `by` clause. + +> `([parameter,] ) [without|by (