Skip to content

Commit

Permalink
Preprocess parentheses for the rfc1035 scanner
Browse files Browse the repository at this point in the history
The ragel scanner for rfc1035 is enormously complex, which makes
it difficult to understand and also huge in terms of generated C
and machine code size, and one of the single worst complicating
factors here is handling rfc1035's parentheses for line
continuations.

This patch introduces a preprocessing step which eliminates them
in-place from the input buffer quickly before the ragel code is
ever invoked, while also keeping linecounts correct for error
reporting.  Along the way, it also converts all zonefile
commentary into whitespace (it had to do this within parens, so
may as well do it universally).

Both parentheses and comment handling are thus removed from the
ragel parser, making it considerably smaller and quicker to build.

The generated C source (using ragel -G2) was reduced from ~632KB to
~309KB (~51% savings), and the size of final optimized gdnsd
binary for my compiler was reduced by ~15% as well.

The net slowdown for zone loading (on my test machine with a fast
NVMe drive testing 1000x randomly-named zonefiles sized ~214KB
each) is only about 7% in spite of the fact that the zonefile
buffer gets scanned twice, which seems a fine trade all things
considered!
  • Loading branch information
blblack committed Dec 6, 2018
1 parent adb52a3 commit ef58705
Show file tree
Hide file tree
Showing 4 changed files with 121 additions and 47 deletions.
5 changes: 3 additions & 2 deletions include/gdnsd/file.h
Expand Up @@ -35,16 +35,17 @@ typedef struct gdnsd_fmap_s_ gdnsd_fmap_t;
// will return length 0 and a valid pointer to 1 NUL byte.
// "seq" is an optimization hint: set to true if expected buffer access
// pattern is sequential.
// "mod" gives a writeable private buffer, instead of a readonly shared one
F_NONNULL F_WUNUSED
gdnsd_fmap_t* gdnsd_fmap_new(const char* fn, const bool seq);
gdnsd_fmap_t* gdnsd_fmap_new(const char* fn, const bool seq, const bool mod);

// Get the length of the mapped file data (zero is possible)
F_NONNULL F_PURE
size_t gdnsd_fmap_get_len(const gdnsd_fmap_t* fmap);

// Get the buffer pointer for the mapped file data (always a valid pointer)
F_NONNULL F_RETNN F_PURE
const void* gdnsd_fmap_get_buf(const gdnsd_fmap_t* fmap);
void* gdnsd_fmap_get_buf(const gdnsd_fmap_t* fmap);

// Destructs the fmap_t object, which includes unmap() of the memory
// returned via fmap_get_buf().
Expand Down
8 changes: 5 additions & 3 deletions libgdnsd/file.c
Expand Up @@ -37,7 +37,7 @@ struct gdnsd_fmap_s_ {
size_t len;
};

gdnsd_fmap_t* gdnsd_fmap_new(const char* fn, const bool seq)
gdnsd_fmap_t* gdnsd_fmap_new(const char* fn, const bool seq, const bool mod)
{
const int fd = open(fn, O_RDONLY | O_CLOEXEC);
if (fd < 0) {
Expand All @@ -64,7 +64,9 @@ gdnsd_fmap_t* gdnsd_fmap_new(const char* fn, const bool seq)
char* mapbuf = NULL;

if (len) {
mapbuf = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
const int prot = mod ? (PROT_READ | PROT_WRITE) : PROT_READ;
const int flags = mod ? MAP_PRIVATE : MAP_SHARED;
mapbuf = mmap(NULL, len, prot, flags, fd, 0);
if (mapbuf == MAP_FAILED) {
log_err("Cannot mmap '%s': %s", fn, logf_errno());
close(fd);
Expand Down Expand Up @@ -94,7 +96,7 @@ gdnsd_fmap_t* gdnsd_fmap_new(const char* fn, const bool seq)
return fmap;
}

const void* gdnsd_fmap_get_buf(const gdnsd_fmap_t* fmap)
void* gdnsd_fmap_get_buf(const gdnsd_fmap_t* fmap)
{
gdnsd_assert(fmap->buf);
return fmap->buf;
Expand Down
2 changes: 1 addition & 1 deletion libgdnsd/vscf.rl
Expand Up @@ -820,7 +820,7 @@ GDNSD_DIAG_POP
vscf_data_t* vscf_scan_filename(const char* fn)
{
vscf_data_t* rv = NULL;
gdnsd_fmap_t* fmap = gdnsd_fmap_new(fn, true);
gdnsd_fmap_t* fmap = gdnsd_fmap_new(fn, true, false);
if (fmap) {
const size_t len = gdnsd_fmap_get_len(fmap);
const char* buf = gdnsd_fmap_get_buf(fmap);
Expand Down
153 changes: 112 additions & 41 deletions src/zscan_rfc1035.rl
Expand Up @@ -54,7 +54,6 @@
typedef struct {
uint8_t ipv6[16];
uint32_t ipv4;
bool in_paren;
bool zn_err_detect;
bool lhs_is_ooz;
unsigned lcount;
Expand Down Expand Up @@ -91,7 +90,7 @@ typedef struct {
} zscan_t;

F_NONNULL
static void scanner(zscan_t* z, const char* buf, const size_t bufsize);
static void scanner(zscan_t* z, char* buf, const size_t bufsize);

/******** IP Addresses ********/

Expand Down Expand Up @@ -269,9 +268,9 @@ static void dname_set(zscan_t* z, uint8_t* dname, unsigned len, bool lhs)
// function pointer to eliminate the possibility of
// inlining on non-gcc compilers, I hope) to avoid issues with
// setjmp and all of the local auto variables in zscan_rfc1035() below.
typedef bool (*sij_func_t)(zscan_t*, const char*, const unsigned);
typedef bool (*sij_func_t)(zscan_t*, char*, const unsigned);
F_NONNULL F_NOINLINE
static bool _scan_isolate_jmp(zscan_t* z, const char* buf, const unsigned bufsize)
static bool _scan_isolate_jmp(zscan_t* z, char* buf, const unsigned bufsize)
{
if (!sigsetjmp(z->jbuf, 0)) {
scanner(z, buf, bufsize);
Expand All @@ -287,14 +286,14 @@ static bool zscan_do(zone_t* zone, const uint8_t* origin, const char* fn, const

bool failed = false;

gdnsd_fmap_t* fmap = gdnsd_fmap_new(fn, true);
gdnsd_fmap_t* fmap = gdnsd_fmap_new(fn, true, true);
if (!fmap) {
failed = true;
return failed;
}

const size_t bufsize = gdnsd_fmap_get_len(fmap);
const char* buf = gdnsd_fmap_get_buf(fmap);
char* buf = gdnsd_fmap_get_buf(fmap);

zscan_t* z = xcalloc(sizeof(*z));
z->lcount = 1;
Expand Down Expand Up @@ -711,20 +710,108 @@ static void set_limit_v6(zscan_t* z)
z->limit_v6 = z->uval;
}

F_NONNULL
static void open_paren(zscan_t* z)
// The external entrypoint to the parser
bool zscan_rfc1035(zone_t* zone, const char* fn)
{
if (z->in_paren)
parse_error_noargs("Parenthetical error: double-open");
z->in_paren = true;
gdnsd_assert(zone->dname);
log_debug("rfc1035: Scanning zonefile '%s'", logf_dname(zone->dname));
return zscan_do(zone, zone->dname, fn, gcfg->zones_default_ttl, 0, 0);
}

// This pre-processor does two important things that vastly simplify the real
// ragel parser:
// 1) Gets rid of all comments, replacing their characters with spaces so that
// they're just seen as excessive whitespace. Technically we only needed
// to strip comments for the () case below, which is the complicated one
// for ragel, but since we're doing it anyways it seemed simpler to do
// universally and take comment-handling out of ragel as well.
// 2) Gets rid of all awful rfc1035 () line continuation, replacing the
// parentheses themselves with spaces, and replacing any embedded newlines
// with the formfeed character \f (which the ragel parser treats as
// whitespace, but also knows to increment linecount on these so that error
// reporting still shows the correct line number).

#define preproc_err(_msg) \
do {\
log_err("rfc1035: Zone %s: Zonefile preprocessing error at file %s line %lu: " _msg, logf_dname(z->zone->dname), z->curfn, line_num);\
siglongjmp(z->jbuf, 1);\
} while (0)

F_NONNULL
static void close_paren(zscan_t* z)
static void preprocess_buf(zscan_t* z, char* buf, const size_t buflen)
{
if (!z->in_paren)
parse_error_noargs("Parenthetical error: unnecessary close");
z->in_paren = false;
// This is validated with a user-facing error before calling this function!
gdnsd_assert(buf[buflen - 1] == '\n');

bool in_quotes = false;
bool in_parens = false;
size_t line_num = 1;
for (size_t i = 0; i < buflen; i++) {
switch (buf[i]) {
case '\n':
line_num++;
// In parens, replace \n with \f. The ragel parser treats \f as
// whitespace but knows to increment the line count so that error
// reports are sane, while true unescaped \n terminates records.
if (in_parens && !in_quotes)
buf[i] = '\f';
break;
case ';':
if (!in_quotes) {
// Note we don't check i < buflen while advancing here, because
// there's a check that the final character of the buffer must
// be '\n' before the preprocessor is even invoked, which is
// re-asserted at the top of this function.
do {
buf[i++] = ' ';
} while (buf[i] != '\n');
line_num++;
if (in_parens)
buf[i] = '\f';
}
break;
case '"':
in_quotes = !in_quotes;
break;
case '(':
if (!in_quotes) {
if (in_parens)
preproc_err("Parentheses double-opened");
in_parens = true;
buf[i] = ' ';
}
break;
case ')':
if (!in_quotes) {
if (!in_parens)
preproc_err("Parentheses double-closed");
in_parens = false;
buf[i] = ' ';
}
break;
case '\\':
// Skip one escaped char. Note 3-digit escapes exist as well, but
// we're only concerned here with escaping of metachars, so it
// turns out we don't have to track for the 3-digit escapes here.
// We do have to keep the line count accurate in the case of an
// escaped newline, though.
if (buf[++i] == '\n')
line_num++;
break;
case '\f':
// Because \f is a special metachar for our ()-handling
if (!in_quotes)
preproc_err("Literal formfeed character not allowed in unquoted text: please escape it!");
break;
default:
break;
}
}

if (in_quotes)
preproc_err("Unterminated open double-quote at EOF");
if (in_parens)
preproc_err("Unterminated open parentheses at EOF");
}

// *INDENT-OFF*
Expand Down Expand Up @@ -797,23 +884,14 @@ static void close_paren(zscan_t* z)

action rfc3597_data_setup { rfc3597_data_setup(z); }
action rfc3597_octet { rfc3597_octet(z); }
action open_paren { open_paren(z); }
action close_paren { close_paren(z); }
action in_paren { z->in_paren }

# newlines, count them
nl = '\n' %{ z->lcount++; };

# Single Line Comment, e.g. ; dns comment
slc = ';' [^\n]*;

# Whitespace, with special handling for braindead () multi-line records
ws = (
[ \t]+
| '(' $open_paren
| ')' $close_paren
| (slc? nl)+ when in_paren
)+;
# Whitespace: note we use the special metachar \f as a linecount-bumping
# whitespace, to coordinate with the preprocessor's removal of true
# newlines within parentheses.
ws = ( [ \t] | ('\f' %{ z->lcount++; } ))+;

# Escape sequences in general for any character-string
# (domainname or TXT record rdata, etc)
Expand All @@ -825,7 +903,7 @@ static void close_paren(zscan_t* z)

# The base set of literal characters allowed in unquoted character
# strings (again, labels or txt rdata chunks)
lit_chr = [^; \t"\n\\)(];
lit_chr = [^; \t\f"\n\\)(];

# plugin / resource names for DYNA
plugres = ((lit_chr - [!]) | escapes)+;
Expand Down Expand Up @@ -946,14 +1024,13 @@ static void close_paren(zscan_t* z)
# A zonefile is composed of many resource records
# and commands and comments and such...
statement = rr | cmd;
main := (statement? ws? ((slc? nl) when !in_paren))*;
main := (statement? ws? nl)*;

write data;
}%%
// *INDENT-ON*

F_NONNULL
static void scanner(zscan_t* z, const char* buf, const size_t bufsize)
static void scanner(zscan_t* z, char* buf, const size_t bufsize)
{
gdnsd_assert(bufsize);

Expand All @@ -966,6 +1043,9 @@ static void scanner(zscan_t* z, const char* buf, const size_t bufsize)
return;
}

// Undo parentheses braindamage before real parsing
preprocess_buf(z, buf, bufsize);

(void)zone_en_main; // silence unused var warning from generated code

int cs = zone_start;
Expand All @@ -978,9 +1058,7 @@ static void scanner(zscan_t* z, const char* buf, const size_t bufsize)
const char* p = buf;
const char* pe = buf + bufsize;
const char* eof = pe;
// *INDENT-OFF*
%% write exec;
// *INDENT-ON*
#endif // __clang_analyzer__
GDNSD_DIAG_POP
GDNSD_DIAG_POP
Expand All @@ -990,10 +1068,3 @@ static void scanner(zscan_t* z, const char* buf, const size_t bufsize)
else if (cs < zone_first_final)
parse_error_noargs("Trailing incomplete or unparseable record at end of file");
}

bool zscan_rfc1035(zone_t* zone, const char* fn)
{
gdnsd_assert(zone->dname);
log_debug("rfc1035: Scanning zonefile '%s'", logf_dname(zone->dname));
return zscan_do(zone, zone->dname, fn, gcfg->zones_default_ttl, 0, 0);
}

0 comments on commit ef58705

Please sign in to comment.