Skip to content

Commit

Permalink
lib-http: client: Fixed peer reconnection failure handling.
Browse files Browse the repository at this point in the history
The addressed problem occurs in a very specific situation in which the original successful connection is dropped, yet a new connection fails.
It manifests as an assertion failure or panic:

Panic: file ioloop-epoll.c: line 189 (io_loop_handler_run_internal): assertion failed: (msecs >= 0)
Panic: BUG: No IOs or timeouts set. Not waiting for infinity.

The timing is very critical. However, this doesn't mean that the occurrence of this problem is very unlikely; it can happen frequently under high load.
  • Loading branch information
stephanbosch authored and GitLab committed Feb 2, 2017
1 parent aa881f8 commit b3df4be
Show file tree
Hide file tree
Showing 2 changed files with 189 additions and 30 deletions.
67 changes: 37 additions & 30 deletions src/lib-http/http-client-queue.c
Expand Up @@ -478,18 +478,6 @@ http_client_queue_connection_failure(struct http_client_queue *queue,
struct http_client_peer *failed_peer;
struct http_client_peer *const *peer_idx;

if (queue->cur_peer != NULL) {
/* The peer still has some working connections, which means that
pending requests wait until they're picked up by those connections
or the remaining connections fail as well. In the latter case,
connecting to different peer can resolve the situation, but only
if there is more than one IP. In any other case, the requests will
eventually fail. In the future we could start connections to the next
IP at this point already, but that is no small change. */
i_assert(array_count(&queue->pending_peers) == 0);
return;
}

http_client_queue_debug(queue,
"Failed to set up connection to %s%s: %s "
"(%u peers pending, %u requests pending)",
Expand All @@ -499,25 +487,44 @@ http_client_queue_connection_failure(struct http_client_queue *queue,
reason, array_count(&queue->pending_peers),
array_count(&queue->requests));

/* we're still doing the initial connections to this hport. if
we're also doing parallel connections with soft timeouts
(pending_peer_count>1), wait for them to finish
first. */
failed_peer = NULL;
array_foreach(&queue->pending_peers, peer_idx) {
if (http_client_peer_addr_cmp(&(*peer_idx)->addr, addr) == 0) {
failed_peer = *peer_idx;
array_delete(&queue->pending_peers,
array_foreach_idx(&queue->pending_peers, peer_idx), 1);
break;
if (queue->cur_peer != NULL) {
if (http_client_peer_is_connected(queue->cur_peer)) {
/* The peer still has some working connections, which means that
pending requests wait until they're picked up by those connections
or the remaining connections fail as well. In the latter case,
connecting to different peer can resolve the situation, but only
if there is more than one IP. In any other case, the requests will
eventually fail. In the future we could start connections to the next
IP at this point already, but that is no small change. */
i_assert(array_count(&queue->pending_peers) == 0);
return;
}

failed_peer = queue->cur_peer;
http_client_peer_unlink_queue(queue->cur_peer, queue);
queue->cur_peer = NULL;

} else {
/* we're still doing the initial connections to this hport. if
we're also doing parallel connections with soft timeouts
(pending_peer_count>1), wait for them to finish
first. */
failed_peer = NULL;
array_foreach(&queue->pending_peers, peer_idx) {
if (http_client_peer_addr_cmp(&(*peer_idx)->addr, addr) == 0) {
failed_peer = *peer_idx;
array_delete(&queue->pending_peers,
array_foreach_idx(&queue->pending_peers, peer_idx), 1);
break;
}
}
i_assert(failed_peer != NULL);
if (array_count(&queue->pending_peers) > 0) {
http_client_queue_debug(queue,
"Waiting for remaining pending peers.");
http_client_peer_unlink_queue(failed_peer, queue);
return;
}
}
i_assert(failed_peer != NULL);
if (array_count(&queue->pending_peers) > 0) {
http_client_queue_debug(queue,
"Waiting for remaining pending peers.");
http_client_peer_unlink_queue(failed_peer, queue);
return;
}

/* one of the connections failed. if we're not using soft timeouts,
Expand Down
152 changes: 152 additions & 0 deletions src/lib-http/test-http-client-errors.c
Expand Up @@ -2453,6 +2453,157 @@ static void test_peer_reuse_failure(void)
test_end();
}

/*
* Reconnect failure
*/

/* dns */

static void
test_dns_reconnect_failure_input(struct server_connection *conn)
{
static unsigned int count = 0;
const char *line;

while ((line=i_stream_read_next_line(conn->conn.input)) != NULL) {
if (debug)
i_debug("DNS REQUEST %u: %s", count, line);

if (count == 0) {
o_stream_nsend_str(conn->conn.output,
"0 1\n127.0.0.1\n");
} else {
o_stream_nsend_str(conn->conn.output,
t_strdup_printf("%d\n", EAI_FAIL));
if (count > 4) {
server_connection_deinit(&conn);
return;
}
}
count++;
}
}

static void test_dns_reconnect_failure(void)
{
test_server_input = test_dns_reconnect_failure_input;
test_server_run(0);
}
/* server */

static void
test_reconnect_failure_input(struct server_connection *conn)
{
static const char *resp =
"HTTP/1.1 200 OK\r\n"
"Content-Length: 18\r\n"
"\r\n"
"Everything is OK\r\n";

o_stream_nsend_str(conn->conn.output, resp);
i_close_fd(&fd_listen);
sleep(500);
}

static void test_server_reconnect_failure(unsigned int index)
{
test_server_input = test_reconnect_failure_input;
test_server_run(index);
}

/* client */

struct _reconnect_failure_ctx {
struct timeout *to;
};

static void
test_client_reconnect_failure_response2(
const struct http_response *resp,
struct _reconnect_failure_ctx *ctx)
{
if (debug)
i_debug("RESPONSE: %u %s", resp->status, resp->reason);

test_assert(resp->status == 9002);
test_assert(resp->reason != NULL && *resp->reason != '\0');

io_loop_stop(ioloop);
i_free(ctx);
}

static void
test_client_reconnect_failure_next(
struct _reconnect_failure_ctx *ctx)
{
struct http_client_request *hreq;

if (debug)
i_debug("NEXT REQUEST");

timeout_remove(&ctx->to);

hreq = http_client_request(http_client,
"GET", "example.com", "/reconnect-failure-2.txt",
test_client_reconnect_failure_response2, ctx);
http_client_request_set_port(hreq, bind_ports[0]);
http_client_request_submit(hreq);
}

static void
test_client_reconnect_failure_response1(
const struct http_response *resp,
struct _reconnect_failure_ctx *ctx)
{
if (debug)
i_debug("RESPONSE: %u %s", resp->status, resp->reason);

test_assert(resp->status == 200);
test_assert(resp->reason != NULL && *resp->reason != '\0');

ctx->to = timeout_add_short(999,
test_client_reconnect_failure_next, ctx);
}

static bool
test_client_reconnect_failure(const struct http_client_settings *client_set)
{
struct http_client_request *hreq;
struct _reconnect_failure_ctx *ctx;

ctx = i_new(struct _reconnect_failure_ctx, 1);

http_client = http_client_init(client_set);

hreq = http_client_request(http_client,
"GET", "example.com", "/reconnect-failure-1.txt",
test_client_reconnect_failure_response1, ctx);
http_client_request_set_port(hreq, bind_ports[0]);
http_client_request_submit(hreq);

return TRUE;
}

/* test */

static void test_reconnect_failure(void)
{
struct http_client_settings http_client_set;

test_client_defaults(&http_client_set);
http_client_set.dns_client_socket_path = "./dns-test";
http_client_set.dns_ttl_msecs = 2000;
http_client_set.max_idle_time_msecs = 1000;
http_client_set.max_attempts = 1;
http_client_set.request_timeout_msecs = 1000;

test_begin("reconnect failure");
test_run_client_server(&http_client_set,
test_client_reconnect_failure,
test_server_reconnect_failure, 1,
test_dns_reconnect_failure);
test_end();
}

/*
* All tests
Expand Down Expand Up @@ -2482,6 +2633,7 @@ static void (*const test_functions[])(void) = {
test_dns_lookup_failure,
test_dns_lookup_ttl,
test_peer_reuse_failure,
test_reconnect_failure,
NULL
};

Expand Down

0 comments on commit b3df4be

Please sign in to comment.