Large diffs are not rendered by default.

@@ -1,3 +1,6 @@
/*!re2c re2c:flags:no-debug-info = 1; */
/*!re2c re2c:indent:string = ' '; */

#include <stdlib.h>
#include "ext_scanners.h"

@@ -22,15 +25,14 @@ bufsize_t _ext_scan_at(bufsize_t (*scanner)(const unsigned char *), unsigned cha
re2c:define:YYCTYPE = "unsigned char";
re2c:define:YYCURSOR = p;
re2c:define:YYMARKER = marker;
re2c:define:YYCTXMARKER = marker;
re2c:yyfill:enable = 0;
spacechar = [ \t\v\f];
newline = [\r]?[\n];
escaped_char = [\\][|!"#$%&'()*+,./:;<=>?@[\\\]^_`{}~-];
table_marker = (spacechar*[:]?[-]+[:]?spacechar*);
table_cell = (escaped_char|[^|\r\n])*;
table_cell = (escaped_char|[^|\r\n])+;
tasklist = spacechar*("-"|"+"|"*"|[0-9]+.)spacechar+("[ ]"|"[x]")spacechar+;
*/
@@ -39,47 +41,52 @@ bufsize_t _scan_table_start(const unsigned char *p)
{
const unsigned char *marker = NULL;
const unsigned char *start = p;
/*!re2c
[|]? table_marker ([|] table_marker)* [|]? spacechar* newline { return (bufsize_t)(p - start); }
.? { return 0; }
*/
/*!re2c
[|]? table_marker ([|] table_marker)* [|]? spacechar* newline {
return (bufsize_t)(p - start);
}
* { return 0; }
*/
}

// Scans for a table cell beginning at `p`.
//
// Returns the length in bytes of the text matched by the `table_cell` rule
// (computed as `p - start` once the scanner has moved `p`), or 0 when only
// the fallback rule applies.
//
// NOTE(review): the body contains two consecutive re2c blocks with the same
// `table_cell` rule but different fallback rules (`.?` in the first, `*` in
// the second). Every rule in the first block returns, so the second block
// looks unreachable; this resembles the before/after versions of one change
// left side by side. Confirm which block is intended and remove the other.
bufsize_t _scan_table_cell(const unsigned char *p)
{
// Backtracking pointer; the `re2c:define` settings in this file bind
// YYMARKER and YYCTXMARKER to the name `marker`.
const unsigned char *marker = NULL;
// Remember where scanning began so the match length can be reported.
const unsigned char *start = p;
/*!re2c
table_cell { return (bufsize_t)(p - start); }
.? { return 0; }
*/
/*!re2c
// In fact, `table_cell` matches non-empty table cells only. The empty
// string is also a valid table cell, but is handled by the default rule.
// This approach prevents re2c's match-empty-string warning.
table_cell { return (bufsize_t)(p - start); }
* { return 0; }
*/
}

// Scans for the end of a table cell beginning at `p`: a pipe character
// followed by optional `spacechar` characters.
//
// Returns the length in bytes of the matched text (`p - start`), or 0 when
// only the fallback rule applies.
//
// NOTE(review): the body contains two consecutive re2c blocks whose main
// rules differ: the first also accepts an optional trailing `newline`, the
// second does not. Every rule in the first block returns, so the second block
// looks unreachable; this resembles the before/after versions of one change
// left side by side. Confirm which rule is intended and remove the other.
bufsize_t _scan_table_cell_end(const unsigned char *p)
{
// Backtracking pointer; the `re2c:define` settings in this file bind
// YYMARKER and YYCTXMARKER to the name `marker`.
const unsigned char *marker = NULL;
// Remember where scanning began so the match length can be reported.
const unsigned char *start = p;
/*!re2c
[|] spacechar* newline? { return (bufsize_t)(p - start); }
.? { return 0; }
*/
/*!re2c
[|] spacechar* { return (bufsize_t)(p - start); }
* { return 0; }
*/
}

// Scans for the end of a table row beginning at `p`: optional `spacechar`
// characters followed by a `newline`.
//
// Returns the length in bytes of the matched text (`p - start`), or 0 when
// only the fallback rule applies.
//
// NOTE(review): the body contains two consecutive re2c blocks with the same
// main rule but different fallback rules (`.?` in the first, `*` in the
// second). Every rule in the first block returns, so the second block looks
// unreachable; this resembles the before/after versions of one change left
// side by side. Confirm which block is intended and remove the other.
bufsize_t _scan_table_row_end(const unsigned char *p)
{
// Backtracking pointer; the `re2c:define` settings in this file bind
// YYMARKER and YYCTXMARKER to the name `marker`.
const unsigned char *marker = NULL;
// Remember where scanning began so the match length can be reported.
const unsigned char *start = p;
/*!re2c
spacechar* newline { return (bufsize_t)(p - start); }
.? { return 0; }
*/
/*!re2c
spacechar* newline { return (bufsize_t)(p - start); }
* { return 0; }
*/
}

// Scans for a task-list item prefix beginning at `p`, as described by the
// `tasklist` named definition in this file's re2c configuration.
//
// Returns the length in bytes of the matched text (`p - start`), or 0 when
// only the fallback rule applies.
//
// NOTE(review): the body contains two consecutive re2c blocks with the same
// `tasklist` rule but different fallback rules (`.?` in the first, `*` in the
// second). Every rule in the first block returns, so the second block looks
// unreachable; this resembles the before/after versions of one change left
// side by side. Confirm which block is intended and remove the other.
bufsize_t _scan_tasklist(const unsigned char *p)
{
// Backtracking pointer; the `re2c:define` settings in this file bind
// YYMARKER and YYCTXMARKER to the name `marker`.
const unsigned char *marker = NULL;
// Remember where scanning began so the match length can be reported.
const unsigned char *start = p;
/*!re2c
tasklist { return (bufsize_t)(p - start); }
.? { return 0; }
*/
/*!re2c
tasklist { return (bufsize_t)(p - start); }
* { return 0; }
*/
}
@@ -114,60 +114,87 @@ static cmark_strbuf *unescape_pipes(cmark_mem *mem, unsigned char *string, bufsi
static table_row *row_from_string(cmark_syntax_extension *self,
cmark_parser *parser, unsigned char *string,
int len) {
// Parses a single table row. It has the following form:
// `delim? table_cell (delim table_cell)* delim? newline`
// Note that cells are allowed to be empty.
//
// From the GitHub-flavored Markdown specification:
//
// > Each row consists of cells containing arbitrary text, in which inlines
// > are parsed, separated by pipes (|). A leading and trailing pipe is also
// > recommended for clarity of reading, and if there’s otherwise parsing
// > ambiguity.

table_row *row = NULL;
bufsize_t cell_matched = 1, pipe_matched = 1, offset;
int cell_end_offset;
int expect_more_cells = 1;
int row_end_offset = 0;

row = (table_row *)parser->mem->calloc(1, sizeof(table_row));
row->n_columns = 0;
row->cells = NULL;

// Scan past the (optional) leading pipe.
offset = scan_table_cell_end(string, len, 0);

// Parse the cells of the row. Stop if we reach the end of the input, or if we
// cannot detect any more cells.
while (offset < len && (cell_matched || pipe_matched)) {
while (offset < len && expect_more_cells) {
cell_matched = scan_table_cell(string, len, offset);
pipe_matched = scan_table_cell_end(string, len, offset + cell_matched);

if (cell_matched || pipe_matched) {
cell_end_offset = offset + cell_matched - 1;
// We are guaranteed to have a cell, since (1) either we found some
// content and cell_matched, or (2) we found an empty cell followed by a
// pipe.
cmark_strbuf *cell_buf = unescape_pipes(parser->mem, string + offset,
cell_matched);
cmark_strbuf_trim(cell_buf);

node_cell *cell = (node_cell *)parser->mem->calloc(1, sizeof(*cell));
cell->buf = cell_buf;
cell->start_offset = offset;
cell->end_offset = offset + cell_matched - 1;

while (cell->start_offset > 0 && string[cell->start_offset - 1] != '|') {
--cell->start_offset;
++cell->internal_offset;
}

row->n_columns += 1;
row->cells = cmark_llist_append(parser->mem, row->cells, cell);
}

offset += cell_matched + pipe_matched;

if (pipe_matched) {
expect_more_cells = 1;
} else {
// We've scanned the last cell. Check if we have reached the end of the row
row_end_offset = scan_table_row_end(string, len, offset);
offset += row_end_offset;

if (string[cell_end_offset] == '\n' || string[cell_end_offset] == '\r') {
row->paragraph_offset = cell_end_offset;
// If the end of the row is not the end of the input,
// the row is not a real row but potentially part of the paragraph
// preceding the table.
if (row_end_offset && offset != len) {
row->paragraph_offset = offset;

cmark_llist_free_full(parser->mem, row->cells, (cmark_free_func)free_table_cell);
row->cells = NULL;
row->n_columns = 0;
} else {
cmark_strbuf *cell_buf = unescape_pipes(parser->mem, string + offset,
cell_matched);
cmark_strbuf_trim(cell_buf);

node_cell *cell = (node_cell *)parser->mem->calloc(1, sizeof(*cell));
cell->buf = cell_buf;
cell->start_offset = offset;
cell->end_offset = cell_end_offset;

while (cell->start_offset > 0 && string[cell->start_offset - 1] != '|') {
--cell->start_offset;
++cell->internal_offset;
}

row->n_columns += 1;
row->cells = cmark_llist_append(parser->mem, row->cells, cell);
}
}
// Scan past the (optional) leading pipe.
offset += scan_table_cell_end(string, len, offset);

offset += cell_matched + pipe_matched;

if (!pipe_matched) {
pipe_matched = scan_table_row_end(string, len, offset);
offset += pipe_matched;
expect_more_cells = 1;
} else {
expect_more_cells = 0;
}
}
}

if (offset != len || !row->n_columns) {
if (offset != len || row->n_columns == 0) {
free_table_row(parser->mem, row);
row = NULL;
}
@@ -199,50 +226,44 @@ static cmark_node *try_opening_table_header(cmark_syntax_extension *self,
cmark_parser *parser,
cmark_node *parent_container,
unsigned char *input, int len) {
bufsize_t matched =
scan_table_start(input, len, cmark_parser_get_first_nonspace(parser));
cmark_node *table_header;
table_row *header_row = NULL;
table_row *marker_row = NULL;
node_table_row *ntr;
const char *parent_string;
uint16_t i;

if (!matched)
return parent_container;

parent_string = cmark_node_get_string_content(parent_container);

cmark_arena_push();

header_row = row_from_string(self, parser, (unsigned char *)parent_string,
(int)strlen(parent_string));

if (!header_row) {
free_table_row(parser->mem, header_row);
cmark_arena_pop();
if (!scan_table_start(input, len, cmark_parser_get_first_nonspace(parser))) {
return parent_container;
}

// Since scan_table_start was successful, we must have a marker row.
marker_row = row_from_string(self, parser,
input + cmark_parser_get_first_nonspace(parser),
len - cmark_parser_get_first_nonspace(parser));

assert(marker_row);

if (header_row->n_columns != marker_row->n_columns) {
free_table_row(parser->mem, header_row);
cmark_arena_push();

// Check for a matching header row. We call `row_from_string` with the entire
// (potentially long) parent container as input, but this should be safe since
// `row_from_string` bails out early if it does not find a row.
parent_string = cmark_node_get_string_content(parent_container);
header_row = row_from_string(self, parser, (unsigned char *)parent_string,
(int)strlen(parent_string));
if (!header_row || header_row->n_columns != marker_row->n_columns) {
free_table_row(parser->mem, marker_row);
free_table_row(parser->mem, header_row);
cmark_arena_pop();
return parent_container;
}

if (cmark_arena_pop()) {
marker_row = row_from_string(
self, parser, input + cmark_parser_get_first_nonspace(parser),
len - cmark_parser_get_first_nonspace(parser));
header_row = row_from_string(self, parser, (unsigned char *)parent_string,
(int)strlen(parent_string));
marker_row = row_from_string(self, parser,
input + cmark_parser_get_first_nonspace(parser),
len - cmark_parser_get_first_nonspace(parser));
}

if (!cmark_node_set_type(parent_container, CMARK_NODE_TABLE)) {
@@ -257,9 +278,7 @@ static cmark_node *try_opening_table_header(cmark_syntax_extension *self,
}

cmark_node_set_syntax_extension(parent_container, self);

parent_container->as.opaque = parser->mem->calloc(1, sizeof(node_table));

set_n_table_columns(parent_container, header_row->n_columns);

uint8_t *alignments =
@@ -87,6 +87,9 @@ def badhash(ref):
"unclosed links B":
("[a](b" * 30000,
re.compile("(\[a\]\(b){30000}")),
"tables":
("aaa\rbbb\n-\v\n" * 30000,
re.compile("^<p>aaa</p>\n<table>\n<thead>\n<tr>\n<th>bbb</th>\n</tr>\n</thead>\n<tbody>\n(<tr>\n<td>aaa</td>\n</tr>\n<tr>\n<td>bbb</td>\n</tr>\n<tr>\n<td>-\x0b</td>\n</tr>\n){29999}</tbody>\n</table>\n$")),
# "many references":
# ("".join(map(lambda x: ("[" + str(x) + "]: u\n"), range(1,5000 * 16))) + "[0] " * 5000,
# re.compile("(\[0\] ){4999}")),
@@ -106,7 +109,7 @@ def run_test(inp, regex):
parser.add_argument('--library-dir', dest='library_dir', nargs='?',
default=None, help='directory containing dynamic library')
args = parser.parse_args(sys.argv[1:])
cmark = CMark(prog=args.program, library_dir=args.library_dir)
cmark = CMark(prog=args.program, library_dir=args.library_dir, extensions="table")

[rc, actual, err] = cmark.to_html(inp)
if rc != 0: