Skip to content

Commit

Permalink
feat(BOUN-998): Further improve logging, add more fields
Browse files Browse the repository at this point in the history
  • Loading branch information
blind-oracle committed Jan 19, 2024
1 parent f7deb73 commit 9c2eb38
Show file tree
Hide file tree
Showing 11 changed files with 143 additions and 95 deletions.
81 changes: 43 additions & 38 deletions ic-os/boundary-guestos/rootfs/etc/nginx/nginx.conf
Expand Up @@ -114,46 +114,51 @@ http {
##

log_format access escape=json '{'
'"body_bytes_sent":' '"$body_bytes_sent"' ','
'"bytes_sent":' '"$bytes_sent"' ','
'"content_length":' '"$content_length"' ','
'"content_type":' '"$content_type"' ','
'"geo_city_name":' '"$geo_city_name"' ','
'"geo_country_code":' '"$geo_country_code"' ','
'"geo_country_name":' '"$geo_country_name"' ','
'"hostname":' '"$hostname"' ','
'"http_host":' '"$http_host"' ','
'"http_origin":' '"$http_origin"' ','
'"http_referer":' '"$http_referer"' ','
'"http_user_agent":' '"$http_user_agent"' ','
'"https":' '"$https"' ','
'"ic_canister_id":' '"$inferred_canister_id"' ','
'"ic_canister_id_cbor":' '"$sent_http_x_ic_canister_id"' ','
'"ic_method_name":' '"$sent_http_x_ic_method_name"' ','
'"ic_node_id":' '"$sent_http_x_ic_node_id"' ','
'"ic_request_type":' '"$sent_http_x_ic_request_type"' ','
'"ic_sender":' '"$sent_http_x_ic_sender"' ','
'"ic_subnet_id":' '"$sent_http_x_ic_subnet_id"' ','
'"is_bot":' '"$is_bot"' ','
'"msec":' '"$msec"' ','
'"pre_isolation_canister":' '"$is_pre_isolation_canister"' ','
'"query_string":' '"$query_string"' ','
'"remote_addr":' '"$remote_addr"' ','
'"request_id":' '"$request_uuid"' ','
'"request_length":' '"$request_length"' ','
'"request_method":' '"$request_method"' ','
'"request_time":' '"$request_time"' ','
'"request_uri":' '"$request_uri"' ','
'"server_protocol":' '"$server_protocol"' ','
'"ssl_cipher":' '"$ssl_cipher"' ','
'"ssl_protocol":' '"$ssl_protocol"' ','
'"status":' '"$status"' ','
'"upstream_addr":' '"$upstream_addr"' ','
'"upstream_cache_status":' '"$upstream_cache_status"' ','
'"upstream_status":' '"$upstream_status"'
'"body_bytes_sent":' '"$body_bytes_sent",'
'"bytes_sent":' '"$bytes_sent",'
'"cache_bypass_reason":' '"$sent_http_x_ic_cache_bypass_reason",'
'"cache_status":' '"$sent_http_x_ic_cache_status",'
'"cache_status_nginx":' '"$upstream_cache_status",'
'"content_length":' '"$content_length",'
'"content_type":' '"$content_type",'
'"error_cause":' '"$sent_http_x_ic_error_cause",'
'"geo_city_name":' '"$geo_city_name",'
'"geo_country_code":' '"$geo_country_code",'
'"geo_country_name":' '"$geo_country_name",'
'"hostname":' '"$hostname",'
'"http_host":' '"$http_host",'
'"http_origin":' '"$http_origin",'
'"http_referer":' '"$http_referer",'
'"http_user_agent":' '"$http_user_agent",'
'"https":' '"$https",'
'"ic_canister_id":' '"$inferred_canister_id",'
'"ic_canister_id_cbor":' '"$sent_http_x_ic_canister_id",'
'"ic_method_name":' '"$sent_http_x_ic_method_name",'
'"ic_node_id":' '"$sent_http_x_ic_node_id",'
'"ic_request_type":' '"$sent_http_x_ic_request_type",'
'"ic_sender":' '"$sent_http_x_ic_sender",'
'"ic_subnet_id":' '"$sent_http_x_ic_subnet_id",'
'"is_bot":' '"$is_bot",'
'"msec":' '"$msec",'
'"pre_isolation_canister":' '"$is_pre_isolation_canister",'
'"query_string":' '"$query_string",'
'"remote_addr":' '"$remote_addr",'
'"retries":' '"$sent_http_x_ic_retries",'
'"request_id":' '"$request_uuid",'
'"request_length":' '"$request_length",'
'"request_method":' '"$request_method",'
'"request_time":' '"$request_time",'
'"request_time_headers":' '"$upstream_header_time",'
'"request_uri":' '"$request_uri",'
'"server_protocol":' '"$server_protocol",'
'"ssl_cipher":' '"$ssl_cipher",'
'"ssl_protocol":' '"$ssl_protocol",'
'"status":' '"$status",'
'"status_upstream":' '"$upstream_status",'
'"upstream":' '"$upstream_addr"'
'}';

access_log syslog:server=unix:/dev/log,tag=access,nohostname access;
access_log syslog:server=unix:/var/lib/vector/nginx.socket,tag=access,nohostname access;
error_log syslog:server=unix:/dev/log,tag=error,nohostname warn;

##
Expand Down
Expand Up @@ -13,3 +13,6 @@ net.ipv4.tcp_notsent_lowat = 16384
net.ipv4.tcp_rmem = 4096 131072 134217728
net.ipv4.tcp_wmem = 4096 131072 134217728
net.core.default_qdisc = fq

# Increase Unix domain datagram socket queue size
net.unix.max_dgram_qlen = 8192
Expand Up @@ -19,6 +19,7 @@ ExecStart=/bin/bash -c ' \
--nns-urls "${NNS_URL}" \
--http-port 4444 \
--metrics-addr "[::]:9324" \
--log-failed-requests-only \
--min-ok-count 5 \
--max-height-lag 50 \
--nftables-system-replicas-path /run/ic-node/etc/nftables/system_replicas.ruleset \
Expand Down
Expand Up @@ -3,6 +3,7 @@ Description=Vector
After=network-online.target
Wants=network-online.target
After=setup-vector.service
Before=nginx.service
BindsTo=setup-vector.service

[Service]
Expand All @@ -13,6 +14,8 @@ Environment="VECTOR_PROMETHUS_ADDR=[::]:9317"
Environment="DANTED_PROMETHUS_ADDR=[::]:9323"
EnvironmentFile=/run/ic-node/etc/default/vector
ExecStart=/usr/bin/vector --config /etc/vector/vector.yaml
# Wait a bit for socket files etc to be created before moving forward
ExecStartPost=/usr/bin/sleep 1

[Install]
WantedBy=multi-user.target
32 changes: 16 additions & 16 deletions ic-os/boundary-guestos/rootfs/etc/vector/vector.yaml
Expand Up @@ -5,28 +5,30 @@ sources:
vector_metrics:
type: internal_metrics

nginx_socket:
type: socket

mode: unix_datagram
socket_file_mode: 666
path: /var/lib/vector/nginx.socket

decoding:
codec: syslog

journal:
type: journald
include_units:
- certificate-issuer
- certificate-syncer
- ic-boundary
- nginx
- danted

transforms:
nginx_access:
type: filter
inputs:
- journal

condition: ._SYSTEMD_UNIT == "nginx.service" && .SYSLOG_IDENTIFIER == "access"

nginx_access_parsed:
type: remap
inputs:
- nginx_access
- nginx_socket

source: |
. = parse_json!(.message)
Expand Down Expand Up @@ -152,7 +154,7 @@ sinks:
inputs:
- vector_metrics

address: "${VECTOR_PROMETHUS_ADDR}"
address: "${VECTOR_PROMETHUS_ADDR:?VECTOR_PROMETHUS_ADDR must be provided}"
default_namespace: vector
suppress_timestamp: true

Expand All @@ -171,13 +173,11 @@ sinks:
inputs:
- nginx_access_parsed

# Hardcoded for now, should change to an env var after migration
uri: https://ch-proxy.dfinity.network/vector/nginx_ingest

uri: "${LOGGING_URL:?LOGGING_URL must be provided}"
auth:
strategy: basic
user: "${CLICKHOUSE_USER:?CLICKHOUSE_USER must be provided}"
password: "${CLICKHOUSE_PASSWORD:?CLICKHOUSE_PASSWORD must be provided}"
user: "${LOGGING_USER:?LOGGING_USER must be provided}"
password: "${LOGGING_PASSWORD:?LOGGING_PASSWORD must be provided}"

compression: zstd
encoding:
Expand Down
8 changes: 3 additions & 5 deletions ic-os/boundary-guestos/rootfs/opt/ic/bin/setup-vector.sh
Expand Up @@ -26,7 +26,6 @@ function read_variables() {
while IFS="=" read -r key value; do
case "${key}" in
"env") ENV="${value}" ;;
"ip_hash_salt") IP_HASH_SALT="${value}" ;;
"logging_url") LOGGING_URL="${value}" ;;
"logging_user") LOGGING_USER="${value}" ;;
"logging_password") LOGGING_PASSWORD="${value}" ;;
Expand All @@ -38,10 +37,9 @@ function generate_vector_config() {
mkdir -p "${RUN_DIR}"
cat >"${ENV_FILE}" <<EOF
ENV=${ENV}
IP_HASH_SALT=${IP_HASH_SALT:-}
CLICKHOUSE_URL=${LOGGING_URL:-}
CLICKHOUSE_USER=${LOGGING_USER:-}
CLICKHOUSE_PASSWORD=${LOGGING_PASSWORD:-}
LOGGING_URL=${LOGGING_URL:-}
LOGGING_USER=${LOGGING_USER:-}
LOGGING_PASSWORD=${LOGGING_PASSWORD:-}
EOF
}

Expand Down
3 changes: 3 additions & 0 deletions rs/boundary_node/ic_boundary/src/cli.rs
Expand Up @@ -172,6 +172,9 @@ pub struct MonitoringConfig {
/// Disable per-request logging and metrics recording
#[clap(long)]
pub disable_request_logging: bool,
/// Log only failed (non-2xx status code or other problems) requests
#[clap(long)]
pub log_failed_requests_only: bool,
}

#[derive(Args)]
Expand Down
6 changes: 5 additions & 1 deletion rs/boundary_node/ic_boundary/src/core.rs
Expand Up @@ -465,7 +465,11 @@ pub fn setup_router(
.layer(option_layer(
(!cli.monitoring.disable_request_logging).then_some(
middleware::from_fn_with_state(
HttpMetricParams::new(metrics_registry, "http_request_in"),
HttpMetricParams::new(
metrics_registry,
"http_request_in",
cli.monitoring.log_failed_requests_only,
),
metrics::metrics_middleware,
),
),
Expand Down
70 changes: 38 additions & 32 deletions rs/boundary_node/ic_boundary/src/metrics.rs
Expand Up @@ -487,14 +487,15 @@ impl MetricParamsCheck {
#[derive(Clone)]
pub struct HttpMetricParams {
pub action: String,
pub log_failed_requests_only: bool,
pub counter: IntCounterVec,
pub durationer: HistogramVec,
pub request_sizer: HistogramVec,
pub response_sizer: HistogramVec,
}

impl HttpMetricParams {
pub fn new(registry: &Registry, action: &str) -> Self {
pub fn new(registry: &Registry, action: &str, log_failed_requests_only: bool) -> Self {
const LABELS_HTTP: &[&str] = &[
"request_type",
"status_code",
Expand All @@ -507,6 +508,7 @@ impl HttpMetricParams {

Self {
action: action.to_string(),
log_failed_requests_only,

counter: register_int_counter_vec_with_registry!(
format!("{action}_total"),
Expand Down Expand Up @@ -659,6 +661,7 @@ pub async fn metrics_middleware(

let HttpMetricParams {
action,
log_failed_requests_only,
counter,
durationer,
request_sizer,
Expand All @@ -668,6 +671,7 @@ pub async fn metrics_middleware(
// Closure that gets called when the response body is fully read (or an error occurs)
let record_metrics = move |response_size: u64, body_result: Result<(), String>| {
let full_duration = start_time.elapsed().as_secs_f64();
let failed = error_cause.is_some() || !status_code.is_success();

let (error_cause, error_details) = match &error_cause {
Some(v) => (Some(v.to_string()), v.details()),
Expand Down Expand Up @@ -732,37 +736,39 @@ pub async fn metrics_middleware(
.get(USER_AGENT)
.map(|v| v.to_str().unwrap_or("parsing_error"));

info!(
action,
request_id,
request_type = ctx.request_type.to_string(),
error_cause,
error_details,
status = status_code.as_u16(),
subnet_id,
node_id,
canister_id = canister_id.map(|x| x.to_string()),
canister_id_cbor = ctx.canister_id.map(|x| x.to_string()),
sender,
method_name = ctx.method_name,
proc_duration,
full_duration,
request_size = ctx.request_size,
response_size,
retry_count = &retry_result.as_ref().map(|x| x.retries),
retry_success = &retry_result.map(|x| x.success.to_string()),
body_error = body_result.err(),
%cache_status,
cache_bypass_reason = cache_bypass_reason.map(|x| x.to_string()),
nonce_len = ctx.nonce.clone().map(|x| x.len()),
arg_len = ctx.arg.clone().map(|x| x.len()),
ip_family,
query_string,
header_host,
header_origin,
header_referer,
header_user_agent,
);
if !log_failed_requests_only || failed {
info!(
action,
request_id,
request_type = ctx.request_type.to_string(),
error_cause,
error_details,
status = status_code.as_u16(),
subnet_id,
node_id,
canister_id = canister_id.map(|x| x.to_string()),
canister_id_cbor = ctx.canister_id.map(|x| x.to_string()),
sender,
method_name = ctx.method_name,
proc_duration,
full_duration,
request_size = ctx.request_size,
response_size,
retry_count = &retry_result.as_ref().map(|x| x.retries),
retry_success = &retry_result.map(|x| x.success.to_string()),
body_error = body_result.err(),
%cache_status,
cache_bypass_reason = cache_bypass_reason.map(|x| x.to_string()),
nonce_len = ctx.nonce.clone().map(|x| x.len()),
arg_len = ctx.arg.clone().map(|x| x.len()),
ip_family,
query_string,
header_host,
header_origin,
header_referer,
header_user_agent,
);
}
};

let (parts, body) = response.into_parts();
Expand Down
16 changes: 14 additions & 2 deletions rs/boundary_node/ic_boundary/src/routes.rs
Expand Up @@ -75,6 +75,8 @@ const HEADER_IC_REQUEST_TYPE: HeaderName = HeaderName::from_static("x-ic-request
#[allow(clippy::declare_interior_mutable_const)]
const HEADER_IC_RETRIES: HeaderName = HeaderName::from_static("x-ic-retries");
#[allow(clippy::declare_interior_mutable_const)]
const HEADER_IC_ERROR_CAUSE: HeaderName = HeaderName::from_static("x-ic-error-cause");
#[allow(clippy::declare_interior_mutable_const)]
const HEADER_X_REQUEST_ID: HeaderName = HeaderName::from_static("x-request-id");

const HEADERS_HIDE_HTTP_REQUEST: [&str; 4] =
Expand Down Expand Up @@ -642,14 +644,24 @@ pub async fn lookup_subnet(
pub async fn postprocess_response(request: Request<Body>, next: Next<Body>) -> impl IntoResponse {
let mut response = next.run(request).await;

let error_cause = response
.extensions()
.get::<ErrorCause>()
.map(|x| x.to_string())
.unwrap_or("none".into());

// Set the correct content-type for all replies if it's not an error
let error_cause = response.extensions().get::<ErrorCause>();
if error_cause.is_none() {
if error_cause == "none" && response.status().is_success() {
response
.headers_mut()
.insert(CONTENT_TYPE, CONTENT_TYPE_CBOR);
}

response.headers_mut().insert(
HEADER_IC_ERROR_CAUSE,
HeaderValue::from_maybe_shared(Bytes::from(error_cause)).unwrap(),
);

// Add cache status if there's one
let cache_status = response.extensions().get::<CacheStatus>().cloned();
if let Some(v) = cache_status {
Expand Down

0 comments on commit 9c2eb38

Please sign in to comment.