Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
359 lines (319 sloc) 13.4 KB
%% @doc Elli middleware for collecting stats via Prometheus.
%% @author Eric Bailey
%% @author Ilya Khaprov
%% @version 0.1.1
%% @reference <a href="https://prometheus.io">Prometheus</a>
%% @copyright 2016 elli-lib team
-module(elli_prometheus).
-author("Eric Bailey").
-author("Ilya Khaprov").
-behaviour(elli_handler).
%% elli_handler callbacks
-export([handle/2, handle_event/3]).
%% Service Metric
-define(UP, elli_up).
%% Metrics for successful requests
-define(TOTAL, http_requests_total).
-define(REQUEST_DURATION, http_request_duration_microseconds).
-define(REQUEST_HEADERS_DURATION, http_request_headers_microseconds).
-define(REQUEST_BODY_DURATION, http_request_body_microseconds).
-define(REQUEST_USER_DURATION, http_request_user_microseconds).
-define(RESPONSE_SEND_DURATION, http_response_send_microseconds).
-define(RESPONSE_SIZE, http_response_size_bytes).
-define(RESPONSE_HEADERS_SIZE, http_response_headers_size_bytes).
-define(RESPONSE_BODY_SIZE, http_response_body_size_bytes).
%% Metrics for failed requests
-define(FAILED_TOTAL, http_requests_failed_total).
-define(BAD_REQUEST_TOTAL, http_bad_requests_total).
-define(CLIENT_CLOSED_TOTAL, http_client_closed_total).
-define(CLIENT_TIMEOUT_TOTAL, http_client_timeout_total).
-define(SCRAPE_DURATION, telemetry_scrape_duration_seconds).
-define(SCRAPE_SIZE, telemetry_scrape_size_bytes).
-define(SCRAPE_ENCODED_SIZE, telemetry_scrape_encoded_size_bytes).
%%%===================================================================
%%% elli_handler callbacks
%%%===================================================================
%% @doc Serve the Prometheus scrape endpoint and ignore everything else.
%%
%% A request is treated as a scrape only when its method is `GET' and its
%% raw path is exactly the path returned by `elli_prometheus_config:path/0'.
%% Every other request yields `ignore'.
%% TODO: Describe format.
%% TODO: Add links to Prometheus and Prometheus.erl docs.
handle(Req, _Config) ->
    MetricsPath = elli_prometheus_config:path(),
    Method = elli_request:method(Req),
    RawPath = elli_request:raw_path(Req),
    IsScrape = Method =:= 'GET' andalso RawPath =:= MetricsPath,
    case IsScrape of
        true -> format_metrics(Req);
        false -> ignore
    end.
%% @doc Turn Elli lifecycle events into Prometheus metric updates.
%%
%% `elli_startup' declares every metric used by this module. Completed
%% responses (`request_complete', `chunk_complete') are recorded through
%% `handle_full_response/3'. Each failure event increments the failed-request
%% counter, plus a more specific counter where one exists. Any other event,
%% or a known event with an unexpected argument shape, is ignored.
handle_event(elli_startup, _Args, _Config) ->
    declare_request_metrics(),
    declare_failure_metrics(),
    declare_scrape_metrics(),
    ok;
handle_event(Event, Args, Config)
  when Event =:= request_complete;
       Event =:= chunk_complete ->
    handle_full_response(Event, Args, Config);
handle_event(Event, _Args, _Config)
  when Event =:= request_closed;
       Event =:= request_timeout;
       Event =:= request_parse_error ->
    count_failed_request(Event);
handle_event(client_closed, [RequestPart], _Config) ->
    prometheus_counter:inc(?CLIENT_CLOSED_TOTAL, [RequestPart]),
    count_failed_request(client_closed);
handle_event(client_timeout, [RequestPart], _Config) ->
    prometheus_counter:inc(?CLIENT_TIMEOUT_TOTAL, [RequestPart]),
    count_failed_request(client_timeout);
handle_event(bad_request, [{Reason, _Details}], _Config) ->
    prometheus_counter:inc(?BAD_REQUEST_TOTAL, [Reason]),
    count_failed_request(bad_request);
handle_event(_Event, _Args, _Config) ->
    ok.

%% Declare the `up' gauge (and set it to 1) together with the counter,
%% histograms and summaries that describe completed requests.
declare_request_metrics() ->
    Labels = elli_prometheus_config:labels(),
    Buckets = elli_prometheus_config:duration_buckets(),
    %% Metrics that distinguish full from chunked responses carry an
    %% extra leading `response_type' label.
    TypedLabels = [response_type | Labels],
    Histograms =
        [metric(?REQUEST_DURATION, TypedLabels, Buckets,
                " latencies in microseconds"),
         metric(?REQUEST_HEADERS_DURATION, Labels, Buckets,
                "time spent receiving and parsing headers"),
         metric(?REQUEST_BODY_DURATION, Labels, Buckets,
                "time spent receiving and parsing body"),
         metric(?REQUEST_USER_DURATION, Labels, Buckets,
                "time spent in user callback"),
         metric(?RESPONSE_SEND_DURATION, TypedLabels, Buckets,
                "time spent sending reply")],
    Summaries =
        [metric(?RESPONSE_SIZE, TypedLabels, "total response size"),
         metric(?RESPONSE_HEADERS_SIZE, TypedLabels, "response headers size"),
         metric(?RESPONSE_BODY_SIZE, TypedLabels, "response body size")],
    prometheus_gauge:declare([{name, ?UP}, {help, "Elli is up?"}]),
    prometheus_gauge:set(?UP, 1),
    prometheus_counter:declare(metric(?TOTAL, Labels, "request count")),
    lists:foreach(fun prometheus_histogram:declare/1, Histograms),
    lists:foreach(fun prometheus_summary:declare/1, Summaries).

%% Declare the counters incremented by the failure events.
declare_failure_metrics() ->
    Counters =
        [metric(?FAILED_TOTAL, [reason], [],
                "failed total count."),
         metric(?BAD_REQUEST_TOTAL, [reason], [],
                "\"bad_request\" errors count"),
         metric(?CLIENT_CLOSED_TOTAL, [request_part], [],
                "\"client_closed\" errors count"),
         metric(?CLIENT_TIMEOUT_TOTAL, [request_part], [],
                "\"client_timeout\" errors count")],
    lists:foreach(fun prometheus_counter:declare/1, Counters).

%% Declare the summaries that describe the scrape endpoint itself, all in
%% the `default' registry.
declare_scrape_metrics() ->
    Registry = default,
    Spec = fun(Name, Help, LabelNames) ->
                   [{name, Name},
                    {help, Help},
                    {labels, LabelNames},
                    {registry, Registry}]
           end,
    Summaries =
        [Spec(?SCRAPE_DURATION, "Scrape duration",
              ["registry", "content_type"]),
         Spec(?SCRAPE_SIZE, "Scrape size, not encoded",
              ["registry", "content_type"]),
         Spec(?SCRAPE_ENCODED_SIZE, "Scrape size, encoded",
              ["registry", "content_type", "encoding"])],
    lists:foreach(fun prometheus_summary:declare/1, Summaries).
%%%===================================================================
%%% Private functions
%%%===================================================================
%% Record count, timing and size metrics for one completed response.
%%
%% `Type' is `request_complete' or `chunk_complete' and becomes the leading
%% "full" / "chunks" label value on the metrics that carry a response type.
%% `GET' requests to the configured metrics path are skipped, so scrapes
%% are not counted as application traffic. Always returns `ok'.
handle_full_response(Type, [Req, Code, _Hs, _B, {Timings, Sizes}], _Config) ->
    MetricsPath = elli_prometheus_config:path(),
    case {elli_request:method(Req), elli_request:raw_path(Req)} of
        {'GET', MetricsPath} ->
            ok;
        _NotAScrape ->
            Labels = labels(Req, Code),
            ResponseType = case Type of
                               request_complete -> "full";
                               chunk_complete -> "chunks"
                           end,
            TypedLabels = [ResponseType | Labels],
            prometheus_counter:inc(?TOTAL, Labels),
            %% {Metric, LabelValues, Phase} for every timing histogram.
            lists:foreach(
              fun({Metric, LabelValues, Phase}) ->
                      prometheus_histogram:observe(Metric, LabelValues,
                                                   duration(Timings, Phase))
              end,
              [{?REQUEST_DURATION, TypedLabels, request},
               {?REQUEST_HEADERS_DURATION, Labels, headers},
               {?REQUEST_BODY_DURATION, Labels, body},
               {?REQUEST_USER_DURATION, Labels, user},
               {?RESPONSE_SEND_DURATION, TypedLabels, send}]),
            %% {Metric, Part} for every size summary.
            lists:foreach(
              fun({Metric, Part}) ->
                      prometheus_summary:observe(Metric, TypedLabels,
                                                 size(Sizes, Part))
              end,
              [{?RESPONSE_SIZE, response},
               {?RESPONSE_HEADERS_SIZE, response_headers},
               {?RESPONSE_BODY_SIZE, response_body}]),
            ok
    end.
%% Increment the failed-request counter, using `Reason' as its single
%% label value.
count_failed_request(Reason) ->
    LabelValues = [Reason],
    prometheus_counter:inc(?FAILED_TOTAL, LabelValues).
%% Build the scrape response for `Req'.
%%
%% The output format and the content encoding are both negotiated before
%% anything is rendered, so a request that cannot be satisfied is rejected
%% without paying for a full scrape and without recording scrape
%% duration/size metrics for a response that is never sent.
%%
%% Returns the `{ok, Headers, Body}' tuple produced by `encode_format/3'.
%% Throws `{406, [], <<>>}' when either negotiation yields `undefined'
%% (presumably turned into a 406 response by Elli - confirm against the
%% Elli handler contract).
format_metrics(Req) ->
    Format = acceptable(negotiate_format(Req)),
    Encoding = acceptable(negotiate_encoding(Req)),
    {ContentType, Scrape} = render_format(Format),
    encode_format(ContentType, Encoding, Scrape).

%% Pass a negotiation result through, aborting with 406 on `undefined'.
acceptable(undefined) -> throw({406, [], <<>>});
acceptable(Choice) -> Choice.
%% Choose the output format for this scrape.
%%
%% When the configured format is `auto', the request's `Accept' header
%% (defaulting to "text/plain") is negotiated against the allowed formats.
%% Any other configured value, including `undefined', is returned as is.
negotiate_format(Req) ->
    pick_format(elli_prometheus_config:format(), Req).

pick_format(auto, Req) ->
    Accept = elli_request:get_header(<<"Accept">>, Req, "text/plain"),
    Alternatives = elli_prometheus_config:allowed_formats(),
    accept_header:negotiate(Accept, Alternatives);
pick_format(Configured, _Req) ->
    Configured.
%% Choose a content encoding from the request's `Accept-Encoding' header
%% (an absent header is treated as the empty string). The candidates, in
%% the order handed to the negotiator, are gzip, deflate and identity.
negotiate_encoding(Req) ->
    Supported = [<<"gzip">>, <<"deflate">>, <<"identity">>],
    Requested = elli_request:get_header(<<"Accept-Encoding">>, Req, ""),
    accept_encoding_header:negotiate(Requested, Supported).
%% Render the `default' registry with the format module `Format'.
%%
%% The rendering call is wrapped in `prometheus_summary:observe_duration/4'
%% and the size of the rendered output is observed afterwards, both
%% labelled with the registry and content type.
%% Returns `{ContentType, Scrape}'.
render_format(Format) ->
    Registry = default,
    ContentType = Format:content_type(),
    LabelValues = [Registry, ContentType],
    Render = fun() -> Format:format(Registry) end,
    Scrape = prometheus_summary:observe_duration(Registry,
                                                 ?SCRAPE_DURATION,
                                                 LabelValues,
                                                 Render),
    ScrapeBytes = iolist_size(Scrape),
    prometheus_summary:observe(Registry, ?SCRAPE_SIZE, LabelValues, ScrapeBytes),
    {ContentType, Scrape}.
%% Apply the negotiated `Encoding' to `Scrape', record the encoded size and
%% build the handler response.
%%
%% Returns `{ok, Headers, Body}'. `Headers' always carries `Content-Type';
%% `Content-Encoding' is added only for a real content coding.
encode_format(ContentType, Encoding, Scrape) ->
    Encoded = encode_format_(Encoding, Scrape),
    Registry = default,
    prometheus_summary:observe(Registry,
                               ?SCRAPE_ENCODED_SIZE,
                               [Registry, ContentType, Encoding],
                               iolist_size(Encoded)),
    {ok, scrape_headers(ContentType, Encoding), Encoded}.

%% Response headers for a scrape. "identity" is only meaningful in
%% Accept-Encoding and should not be sent as a Content-Encoding value, so
%% the header is left out for unencoded bodies.
scrape_headers(ContentType, <<"identity">>) ->
    [{<<"Content-Type">>, ContentType}];
scrape_headers(ContentType, Encoding) ->
    [{<<"Content-Type">>, ContentType},
     {<<"Content-Encoding">>, Encoding}].
%% Encode a rendered scrape with the negotiated content coding.
%%
%% `<<"gzip">>' produces gzip data, `<<"deflate">>' produces zlib-wrapped
%% deflate data and `<<"identity">>' returns `Scrape' untouched.
encode_format_(<<"gzip">>, Scrape) ->
    zlib:gzip(Scrape);
encode_format_(<<"deflate">>, Scrape) ->
    %% zlib:compress/1 performs the same default-level, zlib-format deflate
    %% as a manual deflateInit/deflate(finish) sequence, and it also closes
    %% the stream it opens, so no zlib stream is left behind per scrape.
    zlib:compress(Scrape);
encode_format_(<<"identity">>, Scrape) ->
    Scrape.
%% Elapsed time of one request phase, computed from the `Timings' proplist.
%% `Phase' is one of `request', `headers', `body', `user' or `send'; the
%% result is the phase's end value minus its start value.
duration(Timings, Phase) ->
    {StartKey, EndKey} = phase_keys(Phase),
    duration(StartKey, EndKey, Timings).

%% Difference between the values stored under `EndKey' and `StartKey'.
duration(StartKey, EndKey, Timings) ->
    Started = proplists:get_value(StartKey, Timings),
    Ended = proplists:get_value(EndKey, Timings),
    Ended - Started.

%% Start/end timing keys that delimit each phase.
phase_keys(request) -> {request_start, request_end};
phase_keys(headers) -> {headers_start, headers_end};
phase_keys(body) -> {body_start, body_end};
phase_keys(user) -> {user_start, user_end};
phase_keys(send) -> {send_start, send_end}.
%% Size figures taken from the `Sizes' proplist.
%%
%% `response_headers' reads `resp_headers'. `response_body' is the first
%% defined value among `chunks', `file' and `resp_body' (and `undefined'
%% when none is present). `response' is the sum of the other two.
size(Sizes, response_headers) ->
    proplists:get_value(resp_headers, Sizes);
size(Sizes, response_body) ->
    first_defined([chunks, file, resp_body], Sizes);
size(Sizes, response) ->
    HeaderBytes = size(Sizes, response_headers),
    BodyBytes = size(Sizes, response_body),
    HeaderBytes + BodyBytes.

%% Value of the first key in the list that is present in `Sizes'.
first_defined([], _Sizes) ->
    undefined;
first_defined([Key | Rest], Sizes) ->
    case proplists:get_value(Key, Sizes) of
        undefined -> first_defined(Rest, Sizes);
        Value -> Value
    end.
%% Metric spec proplist with an empty bucket list.
metric(Name, Labels, Desc) ->
    metric(Name, Labels, [], Desc).

%% Metric spec proplist: name, label names, help text (prefixed with
%% "HTTP request ") and buckets, in that order.
metric(Name, Labels, Buckets, Desc) ->
    Help = "HTTP request " ++ Desc,
    lists:zip([name, labels, help, buckets],
              [Name, Labels, Help, Buckets]).
%% Label values for a request/response pair, one per configured label and
%% in the configured order.
labels(Req, StatusCode) ->
    ToValue = fun(Label) -> label(Label, Req, StatusCode) end,
    lists:map(ToValue, elli_prometheus_config:labels()).
%% Value of a single label.
%%
%% `method' is the request method, `handler' the first element of the
%% request path (or "" when the path is empty), `status_code' the status
%% code itself and `status_class' its class as computed by
%% `prometheus_http:status_class/1'.
label(status_class, _Req, StatusCode) ->
    prometheus_http:status_class(StatusCode);
label(status_code, _Req, StatusCode) ->
    StatusCode;
label(method, Req, _StatusCode) ->
    elli_request:method(Req);
label(handler, Req, _StatusCode) ->
    case elli_request:path(Req) of
        [] -> "";
        [FirstSegment | _] -> FirstSegment
    end.
%% request_start
%% headers_start
%% headers_end
%% body_start
%% body_end
%% user_start
%% user_end
%% send_start
%% send_end
%% request_end
%% resp_headers
%% resp_body
%% file
%% chunk
%% exclusive event
%% `request_closed' is sent if the client closes the connection when
%% Elli is waiting for the next request on a keep alive connection.
%%
%% `request_timeout' is sent if the client times out when
%% Elli is waiting for the request.
%%
%% `request_parse_error' fires if the request is invalid and cannot be parsed by
%% [`erlang:decode_packet/3`][decode_packet/3] or it contains a path Elli cannot
%% parse or does not support.
%% `client_closed' can be sent from multiple parts of the request
%% handling. It's sent when the client closes the connection or if for
%% any reason the socket is closed unexpectedly. The `Where' atom
%% tells you in which part of the request processing the closed socket
%% was detected: `receiving_headers', `receiving_body' or `before_response'.
%%
%% `client_timeout' can, like `client_closed', be sent from multiple
%% parts of the request handling. If Elli tries to receive data from
%% the client socket and does not receive anything within a timeout,
%% this event fires and the socket is closed.
%%
%% `bad_request' is sent when Elli detects a request is not well
%% formatted or does not conform to the configured limits. Currently
%% the `Reason' variable can be `{too_many_headers, Headers}'
%% or `{body_size, ContentLength}'.