Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adjust the DNS lookup duration metric #93254

Merged
merged 5 commits into from
Oct 12, 2023
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 30 additions & 15 deletions src/libraries/System.Net.NameResolution/src/System/Net/Dns.cs
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,13 @@ public static string GetHostName()
{
name = NameResolutionPal.GetHostName();
}
catch when (LogFailure(string.Empty, startingTimestamp))
catch (Exception ex) when (LogFailure(string.Empty, startingTimestamp, ex))
{
Debug.Fail("LogFailure should return false");
throw;
}

NameResolutionTelemetry.Log.AfterResolution(string.Empty, startingTimestamp, successful: true);
NameResolutionTelemetry.Log.AfterResolution(string.Empty, startingTimestamp, errorType: null);

if (NetEventSource.Log.IsEnabled()) NetEventSource.Info(null, name);
return name;
Expand Down Expand Up @@ -394,13 +394,13 @@ private static object GetHostEntryOrAddressesCore(string hostName, bool justAddr
Aliases = aliases
};
}
catch when (LogFailure(hostName, startingTimestamp))
catch (Exception ex) when (LogFailure(hostName, startingTimestamp, ex))
{
Debug.Fail("LogFailure should return false");
throw;
}

NameResolutionTelemetry.Log.AfterResolution(hostName, startingTimestamp, successful: true);
NameResolutionTelemetry.Log.AfterResolution(hostName, startingTimestamp, errorType: null);

return result;
}
Expand Down Expand Up @@ -434,13 +434,13 @@ private static object GetHostEntryOrAddressesCore(IPAddress address, bool justAd
}
Debug.Assert(name != null);
}
catch when (LogFailure(address, startingTimestamp))
catch (Exception ex) when (LogFailure(address, startingTimestamp, ex))
{
Debug.Fail("LogFailure should return false");
throw;
}

NameResolutionTelemetry.Log.AfterResolution(address, startingTimestamp, successful: true);
NameResolutionTelemetry.Log.AfterResolution(address, startingTimestamp, errorType: null);

// Do the forward lookup to get the IPs for that host name
startingTimestamp = NameResolutionTelemetry.Log.BeforeResolution(name);
Expand All @@ -464,13 +464,13 @@ private static object GetHostEntryOrAddressesCore(IPAddress address, bool justAd
AddressList = addresses
};
}
catch when (LogFailure(name, startingTimestamp))
catch (Exception ex) when (LogFailure(name, startingTimestamp, ex))
{
Debug.Fail("LogFailure should return false");
throw;
}

NameResolutionTelemetry.Log.AfterResolution(name, startingTimestamp, successful: true);
NameResolutionTelemetry.Log.AfterResolution(name, startingTimestamp, errorType: null);

// One of three things happened:
// 1. Success.
Expand Down Expand Up @@ -577,7 +577,7 @@ private static Task GetHostEntryOrAddressesCoreAsync(string hostName, bool justR
}

private static Task<T>? GetAddrInfoWithTelemetryAsync<T>(string hostName, bool justAddresses, AddressFamily addressFamily, CancellationToken cancellationToken)
where T : class
where T : class
antonfirsov marked this conversation as resolved.
Show resolved Hide resolved
{
long startingTimestamp = Stopwatch.GetTimestamp();
Task? task = NameResolutionPal.GetAddrInfoAsync(hostName, justAddresses, addressFamily, cancellationToken);
Expand All @@ -594,19 +594,33 @@ private static Task GetHostEntryOrAddressesCoreAsync(string hostName, bool justR
static async Task<T> CompleteAsync(Task task, string hostName, long startingTimestamp)
{
_ = NameResolutionTelemetry.Log.BeforeResolution(hostName);
T? result = null;
string? errorType = null;
try
{
result = await ((Task<T>)task).ConfigureAwait(false);
return result;
return await ((Task<T>)task).ConfigureAwait(false);
}
catch (Exception ex)
{
errorType = GetErrorType(ex);
throw;
}
finally
{
NameResolutionTelemetry.Log.AfterResolution(hostName, startingTimestamp, successful: result is not null);
NameResolutionTelemetry.Log.AfterResolution(hostName, startingTimestamp, errorType);
}
}
}

private static string GetErrorType(Exception exception) => (exception as SocketException)?.SocketErrorCode switch
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These strings are specifically for metrics, right? Please add a comment on the method to that effect. Where do these result strings come from? Are they strings dictated by otel, or are you just snake_casing the names from the enum?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These strings are specifically for metrics, right?

Yes.

Where do these result strings come from?

They are defined by us, by mapping enum values to snake-cased strings which is an OTel convention. We do the same in HTTP metrics.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

which is an OTel convention

Why don't we do the same for the _ => case then? i.e. it seems like our general goal here is to take any SocketError value or Exception name, and snake case it, and we're then just optimizing a few cases that we expect to be most common by hardcoding the transformation. In the name of robustness, shouldn't we be dynamically creating the snak_cased version of the resulting SocketError if there is one (outside of the range we've hardcoded) or for the exception type?

Copy link
Member Author

@antonfirsov antonfirsov Oct 10, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As noticed in #93254 (comment), this should be actually the full name of the exception, I will correct it.

Why don't we do the same for the _ => case then?

The specs don't require us to camel-case the exception names (would be very weird with full names). Tagging @lmolkova for a comprehensive explanation.

dynamically creating the snak_cased version of the resulting SocketError

The SocketError-s that can practically appear here are coming from a limited set. I picked the codes we can realistically get from the PAL by looking at our mapping in the Unix PAL, and getaddrinfo return values on Windows.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And when that mapping is augmented in the future, we're going to remember to update this random method as well?

Copy link
Member Author

@antonfirsov antonfirsov Oct 10, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And when that mapping is augmented in the future

That would only happen if getaddrinfo implementations would introduce new error codes which I find very unlikely. Implementing dynamic mapping here to prematurely solve that problem feels like overengineering. Users would see the value host_not_found in 99+% of the practical cases, sometimes try_again maybe.

If you think the mapping is problematic in the form I'm proposing, I would rather prefer to remove it, and just report the exception name.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it seems like our general goal here is to take any SocketError value or Exception name, and snake case it

I'd frame the goal as disambiguating the most common sources of error that users would appreciate not having grouped together when analyzing failures. I don't think it is a goal that every possible SocketError has been disambiguated, that new future SocketError values are automatically disambiguated, or that exception names are transformed into snake case. The hard-coded list Anton has here seems fine to me. If we ever changed it we'd probably want that to be an intentional choice so we can warn users that their queries might need to be updated.

{
SocketError.HostNotFound => "host_not_found",
SocketError.TryAgain => "try_again",
SocketError.AddressFamilyNotSupported => "address_family_not_supported",
SocketError.NoRecovery => "no_recovery",

_ => exception.GetType().Name
};
Copy link
Member Author

@antonfirsov antonfirsov Oct 9, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I picked the codes we can realistically get from the PAL by looking at our mapping in the Unix PAL, and getaddrinfo return values on Windows.
There are some other codes that may theoretically occur, but I wouldn't expect to see them in practice. Ie. seeing InvalidArgument , TypeNotFound or SocketNotSupported means a programming error in our PAL rather than anything else if I'm not mistaken.


private static IPHostEntry CreateHostEntryForAddress(IPAddress address) =>
new IPHostEntry
{
Expand All @@ -627,9 +641,10 @@ private static void ValidateHostName(string hostName)
}
}

private static bool LogFailure(object hostNameOrAddress, long? startingTimestamp)
private static bool LogFailure(object hostNameOrAddress, long? startingTimestamp, Exception exception)
{
NameResolutionTelemetry.Log.AfterResolution(hostNameOrAddress, startingTimestamp, successful: false);
string errorType = GetErrorType(exception);
NameResolutionTelemetry.Log.AfterResolution(hostNameOrAddress, startingTimestamp, errorType);
return false;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,25 @@ internal static class NameResolutionMetrics
private static readonly Meter s_meter = new("System.Net.NameResolution");

private static readonly Histogram<double> s_lookupDuration = s_meter.CreateHistogram<double>(
name: "dns.lookups.duration",
name: "dns.lookup.duration",
unit: "s",
description: "Measures the time taken to perform a DNS lookup.");

public static bool IsEnabled() => s_lookupDuration.Enabled;

public static void AfterResolution(TimeSpan duration, string hostName)
public static void AfterResolution(TimeSpan duration, string hostName, string? errorType)
antonfirsov marked this conversation as resolved.
Show resolved Hide resolved
{
s_lookupDuration.Record(duration.TotalSeconds, KeyValuePair.Create("dns.question.name", (object?)hostName));
var hostNameTag = KeyValuePair.Create("dns.question.name", (object?)hostName);

if (errorType is null)
{
s_lookupDuration.Record(duration.TotalSeconds, hostNameTag);
}
else
{
var errorTypeTag = KeyValuePair.Create("error.type", (object?)errorType);
s_lookupDuration.Record(duration.TotalSeconds, hostNameTag, errorTypeTag);
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ public long BeforeResolution(object hostNameOrAddress)
}

[NonEvent]
public void AfterResolution(object hostNameOrAddress, long? startingTimestamp, bool successful)
public void AfterResolution(object hostNameOrAddress, long? startingTimestamp, string? errorType)
{
Debug.Assert(startingTimestamp.HasValue);
if (startingTimestamp == 0)
Expand All @@ -99,7 +99,7 @@ public void AfterResolution(object hostNameOrAddress, long? startingTimestamp, b

if (IsEnabled(EventLevel.Informational, EventKeywords.None))
{
if (!successful)
if (errorType is not null)
{
ResolutionFailed();
}
Expand All @@ -110,7 +110,7 @@ public void AfterResolution(object hostNameOrAddress, long? startingTimestamp, b

if (NameResolutionMetrics.IsEnabled())
{
NameResolutionMetrics.AfterResolution(duration, GetHostnameFromStateObject(hostNameOrAddress));
NameResolutionMetrics.AfterResolution(duration, GetHostnameFromStateObject(hostNameOrAddress), errorType);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ namespace System.Net.NameResolution.Tests
{
public class MetricsTest
{
private const string DnsLookupDuration = "dns.lookups.duration";
private const string DnsLookupDuration = "dns.lookup.duration";

[ConditionalFact(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
public static void ResolveValidHostName_MetricsRecorded()
Expand Down Expand Up @@ -57,17 +57,26 @@ public static async Task ResolveInvalidHostName_MetricsRecorded()
Assert.ThrowsAny<SocketException>(() => Dns.EndGetHostEntry(Dns.BeginGetHostEntry(InvalidHostName, null, null)));
Assert.ThrowsAny<SocketException>(() => Dns.EndGetHostAddresses(Dns.BeginGetHostAddresses(InvalidHostName, null, null)));

double[] measurements = GetMeasurementsForHostname(recorder, InvalidHostName);
double[] measurements = GetMeasurementsForHostname(recorder, InvalidHostName, "host_not_found");

Assert.Equal(6, measurements.Length);
Assert.All(measurements, m => Assert.True(m > double.Epsilon));
}

private static double[] GetMeasurementsForHostname(InstrumentRecorder<double> recorder, string hostname)
private static double[] GetMeasurementsForHostname(InstrumentRecorder<double> recorder, string hostname, string? expectedErrorType = null)
{
return recorder
.GetMeasurements()
.Where(m => m.Tags.ToArray().Any(t => t.Key == "dns.question.name" && t.Value is string hostnameTag && hostnameTag == hostname))
.Where(m =>
{
KeyValuePair<string, object?>[] tags = m.Tags.ToArray();
if (!tags.Any(t => t.Key == "dns.question.name" && t.Value is string hostnameTag && hostnameTag == hostname))
{
return false;
}
string? actualErrorType = tags.FirstOrDefault(t => t.Key == "error.type").Value as string;
return expectedErrorType == actualErrorType;
})
.Select(m => m.Value)
.ToArray();
}
Expand Down