From b3df4be577af79d93f39e099e5e0b226ab7fd775 Mon Sep 17 00:00:00 2001 From: Stephan Bosch Date: Thu, 2 Feb 2017 01:36:50 +0100 Subject: [PATCH] lib-http: client: Fixed peer reconnection failure handling. The addressed problem occurs in a very specific situation in which the original successful connection is dropped, yet a new connection fails. It manifests as an assertion failure or panic: Panic: file ioloop-epoll.c: line 189 (io_loop_handler_run_internal): assertion failed: (msecs >= 0) Panic: BUG: No IOs or timeouts set. Not waiting for infinity. The timing is very critical. However, this doesn't mean that the occurrence of this problem is very unlikely; it can happen frequently under high load. --- src/lib-http/http-client-queue.c | 67 ++++++----- src/lib-http/test-http-client-errors.c | 152 +++++++++++++++++++++++++ 2 files changed, 189 insertions(+), 30 deletions(-) diff --git a/src/lib-http/http-client-queue.c b/src/lib-http/http-client-queue.c index bf1abb538d..ba3e668d5f 100644 --- a/src/lib-http/http-client-queue.c +++ b/src/lib-http/http-client-queue.c @@ -478,18 +478,6 @@ http_client_queue_connection_failure(struct http_client_queue *queue, struct http_client_peer *failed_peer; struct http_client_peer *const *peer_idx; - if (queue->cur_peer != NULL) { - /* The peer still has some working connections, which means that - pending requests wait until they're picked up by those connections - or the remaining connections fail as well. In the latter case, - connecting to different peer can resolve the situation, but only - if there is more than one IP. In any other case, the requests will - eventually fail. In the future we could start connections to the next - IP at this point already, but that is no small change. */ - i_assert(array_count(&queue->pending_peers) == 0); - return; - } - http_client_queue_debug(queue, "Failed to set up connection to %s%s: %s " "(%u peers pending, %u requests pending)", @@ -499,25 +487,44 @@ http_client_queue_connection_failure(struct http_client_queue *queue, reason, array_count(&queue->pending_peers), array_count(&queue->requests)); - /* we're still doing the initial connections to this hport. if - we're also doing parallel connections with soft timeouts - (pending_peer_count>1), wait for them to finish - first. */ - failed_peer = NULL; - array_foreach(&queue->pending_peers, peer_idx) { - if (http_client_peer_addr_cmp(&(*peer_idx)->addr, addr) == 0) { - failed_peer = *peer_idx; - array_delete(&queue->pending_peers, - array_foreach_idx(&queue->pending_peers, peer_idx), 1); - break; + if (queue->cur_peer != NULL) { + if (http_client_peer_is_connected(queue->cur_peer)) { + /* The peer still has some working connections, which means that + pending requests wait until they're picked up by those connections + or the remaining connections fail as well. In the latter case, + connecting to different peer can resolve the situation, but only + if there is more than one IP. In any other case, the requests will + eventually fail. In the future we could start connections to the next + IP at this point already, but that is no small change. */ + i_assert(array_count(&queue->pending_peers) == 0); + return; + } + + failed_peer = queue->cur_peer; + http_client_peer_unlink_queue(queue->cur_peer, queue); + queue->cur_peer = NULL; + + } else { + /* we're still doing the initial connections to this hport. if + we're also doing parallel connections with soft timeouts + (pending_peer_count>1), wait for them to finish + first. */ + failed_peer = NULL; + array_foreach(&queue->pending_peers, peer_idx) { + if (http_client_peer_addr_cmp(&(*peer_idx)->addr, addr) == 0) { + failed_peer = *peer_idx; + array_delete(&queue->pending_peers, + array_foreach_idx(&queue->pending_peers, peer_idx), 1); + break; + } + } + i_assert(failed_peer != NULL); + if (array_count(&queue->pending_peers) > 0) { + http_client_queue_debug(queue, + "Waiting for remaining pending peers."); + http_client_peer_unlink_queue(failed_peer, queue); + return; } - } - i_assert(failed_peer != NULL); - if (array_count(&queue->pending_peers) > 0) { - http_client_queue_debug(queue, - "Waiting for remaining pending peers."); - http_client_peer_unlink_queue(failed_peer, queue); - return; } /* one of the connections failed. if we're not using soft timeouts, diff --git a/src/lib-http/test-http-client-errors.c b/src/lib-http/test-http-client-errors.c index bae89ba136..59c6ebb892 100644 --- a/src/lib-http/test-http-client-errors.c +++ b/src/lib-http/test-http-client-errors.c @@ -2453,6 +2453,157 @@ static void test_peer_reuse_failure(void) test_end(); } +/* + * Reconnect failure + */ + +/* dns */ + +static void +test_dns_reconnect_failure_input(struct server_connection *conn) +{ + static unsigned int count = 0; + const char *line; + + while ((line=i_stream_read_next_line(conn->conn.input)) != NULL) { + if (debug) + i_debug("DNS REQUEST %u: %s", count, line); + + if (count == 0) { + o_stream_nsend_str(conn->conn.output, + "0 1\n127.0.0.1\n"); + } else { + o_stream_nsend_str(conn->conn.output, + t_strdup_printf("%d\n", EAI_FAIL)); + if (count > 4) { + server_connection_deinit(&conn); + return; + } + } + count++; + } +} + +static void test_dns_reconnect_failure(void) +{ + test_server_input = test_dns_reconnect_failure_input; + test_server_run(0); +} +/* server */ + +static void +test_reconnect_failure_input(struct server_connection *conn) +{ + static const char *resp = + "HTTP/1.1 200 OK\r\n" + "Content-Length: 18\r\n" + "\r\n" + "Everything is OK\r\n"; + + o_stream_nsend_str(conn->conn.output, resp); + i_close_fd(&fd_listen); + sleep(500); +} + +static void test_server_reconnect_failure(unsigned int index) +{ + test_server_input = test_reconnect_failure_input; + test_server_run(index); +} + +/* client */ + +struct _reconnect_failure_ctx { + struct timeout *to; +}; + +static void +test_client_reconnect_failure_response2( + const struct http_response *resp, + struct _reconnect_failure_ctx *ctx) +{ + if (debug) + i_debug("RESPONSE: %u %s", resp->status, resp->reason); + + test_assert(resp->status == 9002); + test_assert(resp->reason != NULL && *resp->reason != '\0'); + + io_loop_stop(ioloop); + i_free(ctx); +} + +static void +test_client_reconnect_failure_next( + struct _reconnect_failure_ctx *ctx) +{ + struct http_client_request *hreq; + + if (debug) + i_debug("NEXT REQUEST"); + + timeout_remove(&ctx->to); + + hreq = http_client_request(http_client, + "GET", "example.com", "/reconnect-failure-2.txt", + test_client_reconnect_failure_response2, ctx); + http_client_request_set_port(hreq, bind_ports[0]); + http_client_request_submit(hreq); +} + +static void +test_client_reconnect_failure_response1( + const struct http_response *resp, + struct _reconnect_failure_ctx *ctx) +{ + if (debug) + i_debug("RESPONSE: %u %s", resp->status, resp->reason); + + test_assert(resp->status == 200); + test_assert(resp->reason != NULL && *resp->reason != '\0'); + + ctx->to = timeout_add_short(999, + test_client_reconnect_failure_next, ctx); +} + +static bool +test_client_reconnect_failure(const struct http_client_settings *client_set) +{ + struct http_client_request *hreq; + struct _reconnect_failure_ctx *ctx; + + ctx = i_new(struct _reconnect_failure_ctx, 1); + + http_client = http_client_init(client_set); + + hreq = http_client_request(http_client, + "GET", "example.com", "/reconnect-failure-1.txt", + test_client_reconnect_failure_response1, ctx); + http_client_request_set_port(hreq, bind_ports[0]); + http_client_request_submit(hreq); + + return TRUE; +} + +/* test */ + +static void test_reconnect_failure(void) +{ + struct http_client_settings http_client_set; + + test_client_defaults(&http_client_set); + http_client_set.dns_client_socket_path = "./dns-test"; + http_client_set.dns_ttl_msecs = 2000; + http_client_set.max_idle_time_msecs = 1000; + http_client_set.max_attempts = 1; + http_client_set.request_timeout_msecs = 1000; + + test_begin("reconnect failure"); + test_run_client_server(&http_client_set, + test_client_reconnect_failure, + test_server_reconnect_failure, 1, + test_dns_reconnect_failure); + test_end(); +} /* * All tests @@ -2482,6 +2633,7 @@ static void (*const test_functions[])(void) = { test_dns_lookup_failure, test_dns_lookup_ttl, test_peer_reuse_failure, + test_reconnect_failure, NULL };