Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
implement parsing of v6 addresses and rejection of 0-length host and …
…ports.

the v6 parsing works by adding extra states for working with the
[] notation for v6 addresses. hosts and ports cannot be 0-length
because we url parsing from ending when we expect those fields to
begin.

http_parser_parse_url gets a free check for the correctness of
CONNECT urls (they can only be host:port).

this addresses the following issues:

i was bored and had my head in this space.
  • Loading branch information
dgwynne authored and bnoordhuis committed Feb 18, 2012
1 parent 7bc668c commit 8da60bc
Show file tree
Hide file tree
Showing 2 changed files with 152 additions and 36 deletions.
114 changes: 82 additions & 32 deletions http_parser.c
Expand Up @@ -265,7 +265,10 @@ enum state
, s_req_schema
, s_req_schema_slash
, s_req_schema_slash_slash
, s_req_schema_slash_slash_end
, s_req_host_start
, s_req_host_v6_start
, s_req_host_v6
, s_req_host_v6_end
, s_req_host
, s_req_port_start
, s_req_port
Expand Down Expand Up @@ -354,6 +357,7 @@ enum header_states
#define IS_ALPHA(c) (LOWER(c) >= 'a' && LOWER(c) <= 'z')
#define IS_NUM(c) ((c) >= '0' && (c) <= '9')
#define IS_ALPHANUM(c) (IS_ALPHA(c) || IS_NUM(c))
#define IS_HEX(c) (IS_NUM(c) || (LOWER(c) >= 'a' && LOWER(c) <= 'f'))

#if HTTP_PARSER_STRICT
#define TOKEN(c) (tokens[(unsigned char)c])
Expand Down Expand Up @@ -410,22 +414,22 @@ int http_message_needs_eof(http_parser *parser);
* URL and non-URL states by looking for these.
*/
static enum state
parse_url_char(enum state s, const char ch, int is_connect)
parse_url_char(enum state s, const char ch)
{
assert(!isspace(ch));

switch (s) {
case s_req_spaces_before_url:
/* Proxied requests are followed by scheme of an absolute URI (alpha).
* All methods except CONNECT are followed by '/' or '*'.
*/

if (ch == '/' || ch == '*') {
return s_req_path;
}

/* Proxied requests are followed by scheme of an absolute URI (alpha).
* CONNECT is followed by a hostname, which begins with alphanum.
* All other methods are followed by '/' or '*' (handled above).
*/
if (IS_ALPHA(ch) || (is_connect && IS_NUM(ch))) {
return (is_connect) ? s_req_host : s_req_schema;
if (IS_ALPHA(ch)) {
return s_req_schema;
}

break;
Expand All @@ -450,17 +454,29 @@ parse_url_char(enum state s, const char ch, int is_connect)

case s_req_schema_slash_slash:
if (ch == '/') {
return s_req_schema_slash_slash_end;
return s_req_host_start;
}

break;

case s_req_host_start:
if (ch == '[') {
return s_req_host_v6_start;
}

if (IS_HOST_CHAR(ch)) {
return s_req_host;
}

break;

case s_req_schema_slash_slash_end:
case s_req_host:
if (IS_HOST_CHAR(ch)) {
return s_req_host;
}

/* FALLTHROUGH */
case s_req_host_v6_end:
switch (ch) {
case ':':
return s_req_port_start;
Expand All @@ -474,12 +490,19 @@ parse_url_char(enum state s, const char ch, int is_connect)

break;

case s_req_port_start:
case s_req_port:
if (IS_NUM(ch)) {
return s_req_port;
case s_req_host_v6:
if (ch == ']') {
return s_req_host_v6_end;
}

/* FALLTHROUGH */
case s_req_host_v6_start:
if (IS_HEX(ch) || ch == ':') {
return s_req_host_v6;
}
break;

case s_req_port:
switch (ch) {
case '/':
return s_req_path;
Expand All @@ -488,6 +511,12 @@ parse_url_char(enum state s, const char ch, int is_connect)
return s_req_query_string_start;
}

/* FALLTHROUGH */
case s_req_port_start:
if (IS_NUM(ch)) {
return s_req_port;
}

break;

case s_req_path:
Expand Down Expand Up @@ -622,12 +651,15 @@ size_t http_parser_execute (http_parser *parser,
case s_req_schema:
case s_req_schema_slash:
case s_req_schema_slash_slash:
case s_req_schema_slash_slash_end:
case s_req_host_start:
case s_req_host_v6_start:
case s_req_host_v6:
case s_req_host_v6_end:
case s_req_host:
case s_req_port_start:
case s_req_port:
case s_req_query_string_start:
case s_req_query_string:
case s_req_host:
case s_req_fragment_start:
case s_req_fragment:
url_mark = data;
Expand Down Expand Up @@ -975,9 +1007,11 @@ size_t http_parser_execute (http_parser *parser,
if (ch == ' ') break;

MARK(url);
if (parser->method == HTTP_CONNECT) {
parser->state = s_req_host_start;
}

parser->state = parse_url_char(
(enum state)parser->state, ch, parser->method == HTTP_CONNECT);
parser->state = parse_url_char((enum state)parser->state, ch);
if (parser->state == s_dead) {
SET_ERRNO(HPE_INVALID_URL);
goto error;
Expand All @@ -989,6 +1023,10 @@ size_t http_parser_execute (http_parser *parser,
case s_req_schema:
case s_req_schema_slash:
case s_req_schema_slash_slash:
case s_req_host_start:
case s_req_host_v6_start:
case s_req_host_v6:
case s_req_port_start:
{
switch (ch) {
/* No whitespace allowed here */
Expand All @@ -998,8 +1036,7 @@ size_t http_parser_execute (http_parser *parser,
SET_ERRNO(HPE_INVALID_URL);
goto error;
default:
parser->state = parse_url_char(
(enum state)parser->state, ch, parser->method == HTTP_CONNECT);
parser->state = parse_url_char((enum state)parser->state, ch);
if (parser->state == s_dead) {
SET_ERRNO(HPE_INVALID_URL);
goto error;
Expand All @@ -1009,21 +1046,15 @@ size_t http_parser_execute (http_parser *parser,
break;
}

case s_req_schema_slash_slash_end:
case s_req_host:
case s_req_port_start:
case s_req_host_v6_end:
case s_req_port:
case s_req_path:
case s_req_query_string_start:
case s_req_query_string:
case s_req_fragment_start:
case s_req_fragment:
{
/* XXX: There is a bug here where if we're on the first character
* of s_req_host (e.g. our URL is 'http://' and we see a whitespace
* character, we'll consider this a valid URL. This seems incorrect,
* but at least it's bug-compatible with what we had before.
*/
switch (ch) {
case ' ':
parser->state = s_req_http_start;
Expand All @@ -1039,8 +1070,7 @@ size_t http_parser_execute (http_parser *parser,
CALLBACK_DATA(url);
break;
default:
parser->state = parse_url_char(
(enum state)parser->state, ch, parser->method == HTTP_CONNECT);
parser->state = parse_url_char((enum state)parser->state, ch);
if (parser->state == s_dead) {
SET_ERRNO(HPE_INVALID_URL);
goto error;
Expand Down Expand Up @@ -1926,11 +1956,11 @@ http_parser_parse_url(const char *buf, size_t buflen, int is_connect,
enum http_parser_url_fields uf, old_uf;

u->port = u->field_set = 0;
s = s_req_spaces_before_url;
s = is_connect ? s_req_host_start : s_req_spaces_before_url;
uf = old_uf = UF_MAX;

for (p = buf; p < buf + buflen; p++) {
s = parse_url_char(s, *p, is_connect);
s = parse_url_char(s, *p);

/* Figure out the next field that we're operating on */
switch (s) {
Expand All @@ -1940,7 +1970,9 @@ http_parser_parse_url(const char *buf, size_t buflen, int is_connect,
/* Skip delimeters */
case s_req_schema_slash:
case s_req_schema_slash_slash:
case s_req_schema_slash_slash_end:
case s_req_host_start:
case s_req_host_v6_start:
case s_req_host_v6_end:
case s_req_port_start:
case s_req_query_string_start:
case s_req_fragment_start:
Expand All @@ -1951,6 +1983,7 @@ http_parser_parse_url(const char *buf, size_t buflen, int is_connect,
break;

case s_req_host:
case s_req_host_v6:
uf = UF_HOST;
break;

Expand Down Expand Up @@ -1988,6 +2021,23 @@ http_parser_parse_url(const char *buf, size_t buflen, int is_connect,
old_uf = uf;
}

/* CONNECT requests can only contain "hostname:port" */
if (is_connect && u->field_set != ((1 << UF_HOST)|(1 << UF_PORT))) {
return 1;
}

/* Make sure we don't end somewhere unexpected */
switch (s) {
case s_req_host_v6_start:
case s_req_host_v6:
case s_req_host_v6_end:
case s_req_host:
case s_req_port_start:
return 1;
default:
break;
}

if (u->field_set & (1 << UF_PORT)) {
/* Don't bother with endp; we've already validated the string */
unsigned long v = strtoul(buf + u->field_data[UF_PORT].off, NULL, 10);
Expand Down
74 changes: 70 additions & 4 deletions test.c
Expand Up @@ -1948,6 +1948,72 @@ const struct url_test url_tests[] =
}
,.rv=0
}

, {.name="proxy ipv6 request"
,.url="http://[1:2::3:4]/"
,.is_connect=0
,.u=
{.field_set=(1 << UF_SCHEMA) | (1 << UF_HOST) | (1 << UF_PATH)
,.port=0
,.field_data=
{{ 0, 4 } /* UF_SCHEMA */
,{ 8, 8 } /* UF_HOST */
,{ 0, 0 } /* UF_PORT */
,{ 17, 1 } /* UF_PATH */
,{ 0, 0 } /* UF_QUERY */
,{ 0, 0 } /* UF_FRAGMENT */
}
}
,.rv=0
}

, {.name="CONNECT ipv6 address"
,.url="[1:2::3:4]:443"
,.is_connect=1
,.u=
{.field_set=(1 << UF_HOST) | (1 << UF_PORT)
,.port=443
,.field_data=
{{ 0, 0 } /* UF_SCHEMA */
,{ 1, 8 } /* UF_HOST */
,{ 11, 3 } /* UF_PORT */
,{ 0, 0 } /* UF_PATH */
,{ 0, 0 } /* UF_QUERY */
,{ 0, 0 } /* UF_FRAGMENT */
}
}
,.rv=0
}

, {.name="proxy empty host"
,.url="http://:443/"
,.is_connect=1
,.rv=1
}

, {.name="proxy empty port"
,.url="http://hostname:/"
,.is_connect=1
,.rv=1
}

, {.name="CONNECT empty host"
,.url=":443"
,.is_connect=1
,.rv=1
}

, {.name="CONNECT empty port"
,.url="hostname:"
,.is_connect=1
,.rv=1
}

, {.name="CONNECT with extra bits"
,.url="hostname:443/"
,.is_connect=1
,.rv=1
}
};

void
Expand Down Expand Up @@ -1993,8 +2059,8 @@ test_parse_url (void)

if (test->rv == 0) {
if (rv != 0) {
printf("\n*** http_parser_parse_url() \"%s\" test failed, "
"unexpected rv %d ***\n\n", test->name, rv);
printf("\n*** http_parser_parse_url(\"%s\") \"%s\" test failed, "
"unexpected rv %d ***\n\n", test->url, test->name, rv);
exit(1);
}

Expand All @@ -2012,8 +2078,8 @@ test_parse_url (void)
} else {
/* test->rv != 0 */
if (rv == 0) {
printf("\n*** http_parser_parse_url() \"%s\" test failed, "
"unexpected rv %d ***\n\n", test->name, rv);
printf("\n*** http_parser_parse_url(\"%s\") \"%s\" test failed, "
"unexpected rv %d ***\n\n", test->url, test->name, rv);
exit(1);
}
}
Expand Down

0 comments on commit 8da60bc

Please sign in to comment.