Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prober metrics collection #222

Merged
merged 6 commits into from Aug 16, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 3 additions & 0 deletions prober/build.gradle
Expand Up @@ -25,6 +25,7 @@ dependencies {
compile deps['com.google.dagger:dagger']
compile deps['com.google.flogger:flogger']
compile deps['com.google.guava:guava']
compile deps['com.google.monitoring-client:metrics']
compile deps['io.netty:netty-buffer']
compile deps['io.netty:netty-codec-http']
compile deps['io.netty:netty-codec']
Expand All @@ -41,10 +42,12 @@ dependencies {
runtime deps['com.google.auto.value:auto-value']
runtime deps['io.netty:netty-tcnative-boringssl-static']

testCompile deps['com.google.monitoring-client:contrib']
testCompile deps['com.google.truth:truth']
testCompile deps['junit:junit']
testCompile deps['org.mockito:mockito-core']
testCompile project(':third_party')
testCompile project(path: ':core', configuration: 'testRuntime')

// Include auto-value in compile until nebula-lint understands
// annotationProcessor
Expand Down
Expand Up @@ -21,6 +21,8 @@
import google.registry.monitoring.blackbox.modules.CertificateModule;
import google.registry.monitoring.blackbox.modules.EppModule;
import google.registry.monitoring.blackbox.modules.WebWhoisModule;
import google.registry.util.Clock;
import google.registry.util.SystemClock;
import io.netty.bootstrap.Bootstrap;
import io.netty.channel.Channel;
import io.netty.channel.EventLoopGroup;
Expand Down Expand Up @@ -54,6 +56,13 @@ static SslProvider provideSslProvider() {
return OpenSsl.isAvailable() ? SslProvider.OPENSSL : SslProvider.JDK;
}

/** {@link Provides} one global {@link Clock} shared by each {@link ProbingSequence}. */
@Provides
@Singleton
static Clock provideClock() {
return new SystemClock();
}

/** {@link Provides} one global {@link EventLoopGroup} shared by each {@link ProbingSequence}. */
@Provides
@Singleton
Expand Down
Expand Up @@ -16,9 +16,12 @@

import com.google.common.flogger.FluentLogger;
import google.registry.monitoring.blackbox.connection.ProbingAction;
import google.registry.monitoring.blackbox.exceptions.FailureException;
import google.registry.monitoring.blackbox.exceptions.UnrecoverableStateException;
import google.registry.monitoring.blackbox.metrics.MetricsCollector;
import google.registry.monitoring.blackbox.tokens.Token;
import google.registry.util.CircularList;
import google.registry.util.Clock;
import io.netty.bootstrap.Bootstrap;
import io.netty.channel.AbstractChannel;
import io.netty.channel.ChannelFuture;
Expand All @@ -45,6 +48,12 @@ public class ProbingSequence extends CircularList<ProbingStep> {

private static final FluentLogger logger = FluentLogger.forEnclosingClass();

/** Shared {@link MetricsCollector} used to record metrics on any step performed. */
private MetricsCollector metrics;

/** Shared {@link Clock} used to record latency on any step performed. */
private Clock clock;

/** Each {@link ProbingSequence} requires a start token to begin running. */
private Token startToken;

Expand All @@ -57,9 +66,16 @@ public class ProbingSequence extends CircularList<ProbingStep> {
/** {@link ProbingSequence} object that represents first step in the sequence. */
private ProbingSequence first;

/** Standard constructor for {@link ProbingSequence} in the list that assigns value and token. */
private ProbingSequence(ProbingStep value, Token startToken) {
/**
* Standard constructor for first {@link ProbingSequence} in the list that assigns value and
* token.
*/
private ProbingSequence(
ProbingStep value, MetricsCollector metrics, Clock clock, Token startToken) {

super(value);
this.metrics = metrics;
this.clock = clock;
this.startToken = startToken;
}

Expand Down Expand Up @@ -95,6 +111,8 @@ public void start() {
* get().generateAction}.
*/
private void runStep(Token token) {
long start = clock.nowUtc().getMillis();

ProbingAction currentAction;
ChannelFuture future;

Expand All @@ -108,12 +126,28 @@ private void runStep(Token token) {
} catch (UnrecoverableStateException e) {
// On an UnrecoverableStateException, terminate the sequence.
logger.atSevere().withCause(e).log("Unrecoverable error in generating or calling action.");

// Records gathered metrics.
metrics.recordResult(
get().protocol().name(),
get().messageTemplate().name(),
get().messageTemplate().responseName(),
MetricsCollector.ResponseType.ERROR,
clock.nowUtc().getMillis() - start);
return;

} catch (Exception e) {
// On any other type of error, restart the sequence at the very first step.
logger.atWarning().withCause(e).log("Error in generating or calling action.");

// Records gathered metrics.
metrics.recordResult(
get().protocol().name(),
get().messageTemplate().name(),
get().messageTemplate().responseName(),
MetricsCollector.ResponseType.ERROR,
clock.nowUtc().getMillis() - start);

// Restart the sequence at the very first step.
restartSequence();
return;
Expand All @@ -125,10 +159,34 @@ private void runStep(Token token) {
// On a successful result, we log as a successful step, and note a success.
logger.atInfo().log(String.format("Successfully completed Probing Step: %s", this));

// Records gathered metrics.
metrics.recordResult(
get().protocol().name(),
get().messageTemplate().name(),
get().messageTemplate().responseName(),
MetricsCollector.ResponseType.SUCCESS,
clock.nowUtc().getMillis() - start);
} else {
// On a failed result, we log the failure and note either a failure or error.
logger.atSevere().withCause(f.cause()).log("Did not result in future success");

// Records gathered metrics as either FAILURE or ERROR depending on future's cause.
if (f.cause() instanceof FailureException) {
metrics.recordResult(
get().protocol().name(),
get().messageTemplate().name(),
get().messageTemplate().responseName(),
MetricsCollector.ResponseType.FAILURE,
clock.nowUtc().getMillis() - start);
} else {
metrics.recordResult(
get().protocol().name(),
get().messageTemplate().name(),
get().messageTemplate().responseName(),
MetricsCollector.ResponseType.ERROR,
clock.nowUtc().getMillis() - start);
}

// If not unrecoverable, we restart the sequence.
if (!(f.cause() instanceof UnrecoverableStateException)) {
restartSequence();
Expand Down Expand Up @@ -181,12 +239,18 @@ public static class Builder extends CircularList.AbstractBuilder<ProbingStep, Pr

private Token startToken;

private MetricsCollector metrics;

private Clock clock;

/**
* This Builder must also be supplied with a {@link Token} to construct a {@link
* ProbingSequence}.
*/
public Builder(Token startToken) {
public Builder(Token startToken, MetricsCollector metrics, Clock clock) {
this.startToken = startToken;
this.metrics = metrics;
this.clock = clock;
}

/** We take special note of the first repeated step. */
Expand All @@ -205,7 +269,7 @@ public Builder add(ProbingStep value) {

@Override
protected ProbingSequence create(ProbingStep value) {
return new ProbingSequence(value, startToken);
return new ProbingSequence(value, metrics, clock, startToken);
}

/**
Expand Down
Expand Up @@ -44,6 +44,12 @@
*/
public class EppRequestMessage extends EppMessage implements OutboundMessageType {

/**
* String that describes the type of EppRequestMessage: hello, login, create, check, delete,
* logout.
*/
private String name;

/** Corresponding {@link EppResponseMessage} that we expect to receive on a successful request. */
private EppResponseMessage expectedResponse;

Expand All @@ -60,10 +66,12 @@ public class EppRequestMessage extends EppMessage implements OutboundMessageType
* Private constructor for {@link EppRequestMessage} that its subclasses use for instantiation.
*/
public EppRequestMessage(
String name,
EppResponseMessage expectedResponse,
String template,
BiFunction<String, String, Map<String, String>> getReplacements) {

this.name = name;
this.expectedResponse = expectedResponse;
this.template = template;
this.getReplacements = getReplacements;
Expand Down Expand Up @@ -125,8 +133,18 @@ public ByteBuf bytes() throws EppClientException {
return buf;
}

/** */
/** Returns the {@link EppResponseMessage} we expect. */
public EppResponseMessage getExpectedResponse() {
return expectedResponse;
}

@Override
public String name() {
return name;
}

@Override
public String responseName() {
return expectedResponse.name();
}
}
Expand Up @@ -79,7 +79,12 @@ public HttpRequestMessage modifyMessage(String... args) throws IllegalArgumentEx
}

@Override
public String toString() {
public String name() {
return String.format("Http(s) Request on: %s", headers().get("host"));
}

@Override
public String responseName() {
return "Http Response";
}
}
Expand Up @@ -30,8 +30,15 @@ public interface OutboundMessageType {

/**
* Necessary to inform metrics collector what kind of message is sent down {@link
* io.netty.channel.ChannelPipeline}
* io.netty.channel.ChannelPipeline}. Not equivalent to toString, as to different instances will
* have the same name if they perform the same action.
*/
@Override
String toString();
String name();

/**
* Necessary to inform metrics collector what kind of message is sent inbound {@link
* io.netty.channel.ChannelPipeline}. Equivalent to {@code name} but for {@link
* InboundMessageType}.
*/
String responseName();
}
@@ -0,0 +1,92 @@
// Copyright 2019 The Nomulus Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package google.registry.monitoring.blackbox.metrics;

import com.google.common.collect.ImmutableSet;
import com.google.monitoring.metrics.EventMetric;
import com.google.monitoring.metrics.ExponentialFitter;
import com.google.monitoring.metrics.IncrementableMetric;
import com.google.monitoring.metrics.LabelDescriptor;
import com.google.monitoring.metrics.MetricRegistryImpl;
import google.registry.util.NonFinalForTesting;
import javax.inject.Inject;
import javax.inject.Singleton;

/** Metrics collection instrumentation. */
@Singleton
public class MetricsCollector {

/** Three standard Response types to be recorded as metrics: SUCCESS, FAILURE, or ERROR. */
public enum ResponseType {
SUCCESS,
FAILURE,
ERROR
}

// Maximum 1 hour latency, this is not specified by the spec, but given we have a one hour idle
// timeout, it seems reasonable that maximum latency is set to 1 hour as well. If we are
// approaching anywhere near 1 hour latency, we'd be way out of SLO anyway.
private static final ExponentialFitter DEFAULT_LATENCY_FITTER =
ExponentialFitter.create(22, 2, 1.0);

private static final ImmutableSet<LabelDescriptor> LABELS =
ImmutableSet.of(
LabelDescriptor.create("protocol", "Name of the protocol."),
LabelDescriptor.create("request", "Name of outbound request"),
LabelDescriptor.create("response", "Name of inbound response"),
LabelDescriptor.create("responseType", "Status of action performed"));

static final IncrementableMetric responsesCounter =
MetricRegistryImpl.getDefault()
.newIncrementableMetric(
"/prober/responses",
"Total number of responses received by the prober.",
"Responses",
LABELS);

static final EventMetric latencyMs =
MetricRegistryImpl.getDefault()
.newEventMetric(
"/prober/latency_specific_ms",
"Round-trip time between a request sent and its corresponding response received.",
"Latency Milliseconds",
LABELS,
DEFAULT_LATENCY_FITTER);

@Inject
MetricsCollector() {}

/**
* Resets all backend metrics.
*
* <p>This should only used in tests to clear out states. No production code should call this
* function.
*/
void resetMetric() {
responsesCounter.reset();
latencyMs.reset();
}

@NonFinalForTesting
public void recordResult(
String protocolName,
String requestName,
String responseName,
ResponseType status,
long latency) {
latencyMs.record(latency, protocolName, requestName, responseName, status.name());
responsesCounter.increment(protocolName, requestName, responseName, status.name());
}
}