Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
174 commits
Select commit Hold shift + click to select a range
c822e73
common : implement parser combinators to simplify chat parsing
aldehir Nov 10, 2025
e6153bb
add virtual destructor to parser_base
aldehir Nov 10, 2025
4ced999
fix memory leak from circular references of rules
aldehir Nov 10, 2025
2a9a13d
implement gbnf grammar building
aldehir Nov 10, 2025
2286532
remove unused private variable
aldehir Nov 10, 2025
3e6662f
create a base visitor and implement id assignment as a visitor
aldehir Nov 11, 2025
76cf0b5
fix const ref for grammar builder
aldehir Nov 11, 2025
9c7b3e8
clean up types, friend classes, and class declarations
aldehir Nov 11, 2025
f02e2b0
remove builder usage from until_parser
aldehir Nov 11, 2025
66cf038
Use a counter class to help assign rule ids
aldehir Nov 11, 2025
2b3caef
cache everything
aldehir Nov 11, 2025
adac6ba
add short description for each parser
aldehir Nov 11, 2025
0be2a93
create a type for the root parser
aldehir Nov 11, 2025
31b386f
implement repetition parser
aldehir Nov 11, 2025
ffb7a6f
Make optional, one_or_more, and zero_or_more subclasses of repetition
aldehir Nov 11, 2025
085404a
improve context constructor
aldehir Nov 11, 2025
6bd9a95
improve until parsing and add benchmarks
aldehir Nov 12, 2025
62656db
remove cached() pattern, cache in parser_base with specialized parsin…
aldehir Nov 12, 2025
18557f3
improve json parsing performance to better match legacy parsing
aldehir Nov 12, 2025
f6aa608
fix const auto * it for windows
aldehir Nov 12, 2025
d58dace
move id assignment to classes instead of using a visitor
aldehir Nov 12, 2025
20f9a1b
create named rules in the command r7b example
aldehir Nov 12, 2025
35b1640
use '.' for any in GBNF
aldehir Nov 12, 2025
bcb1c03
fix parens around choices in gbnf grammar
aldehir Nov 12, 2025
4bed84d
add convenience operators to turn strings to literals
aldehir Nov 12, 2025
c02aaa6
add free-form operators for const char * to simplify defining literals
aldehir Nov 12, 2025
8e82127
simplify test case parser
aldehir Nov 12, 2025
9685b69
implement semantic actions
aldehir Nov 12, 2025
d9a6229
remove groups in favor of actions and a scratchpad
aldehir Nov 12, 2025
117d908
add built in actions for common operations
aldehir Nov 12, 2025
f97abde
add actions to command r7b example
aldehir Nov 12, 2025
3114a0e
use std::default_searcher for platforms that don't have bm
aldehir Nov 12, 2025
cc4d52c
improve parser_type handling and add cast helper
aldehir Nov 13, 2025
c119c12
add partial result type to better control when to run actions
aldehir Nov 13, 2025
39d1095
fix bug in until()
aldehir Nov 13, 2025
eabdb85
run actions on partial results by default
aldehir Nov 13, 2025
692ade2
use common_chat_msg for result
aldehir Nov 13, 2025
6478050
add qwen3 example wip
aldehir Nov 13, 2025
bbdf45f
trash partial idea and simplify
aldehir Nov 13, 2025
dd06972
move action arguments to a struct
aldehir Nov 13, 2025
94bd700
implement aho-corasick matcher for until_parser and to build exclusio…
aldehir Nov 14, 2025
599e2fd
use std::string for input, since std::string_view is incompatible wit…
aldehir Nov 14, 2025
c40b03e
Refactor tests
pwilkin Nov 14, 2025
7745261
improve qwen3 example
aldehir Nov 14, 2025
7492123
implement sax-style parsing and refactor
aldehir Nov 14, 2025
843a279
fix json string in test
aldehir Nov 14, 2025
9f9fd1c
rename classes to use common_chat_ prefix
aldehir Nov 14, 2025
7f92bcf
remove is_ suffix from functions
aldehir Nov 14, 2025
87b92af
rename from id_counter to just counter
aldehir Nov 14, 2025
b1aadf8
Final refactored tests
pwilkin Nov 14, 2025
dcace46
Fix executable name and editorconfig-checker
pwilkin Nov 14, 2025
107000f
Third time's the charm...
pwilkin Nov 14, 2025
4ebddbd
add trigger parser to begin lazy grammar rule generation
aldehir Nov 14, 2025
68f003b
working lazy grammar
aldehir Nov 14, 2025
9f09c9f
refactor json rules now that we check for reachability
aldehir Nov 14, 2025
bee5eb4
reduce pointer usage
aldehir Nov 15, 2025
4228d11
print out grammars in example
aldehir Nov 15, 2025
0f0ece9
merge
aldehir Nov 15, 2025
ea519ca
rename to chat-peg-parser* and common_chat_peg_parser*
aldehir Nov 15, 2025
15564f3
Revert unrelated changes
pwilkin Nov 15, 2025
6dd6cee
New macros for CMakeLists to enable multi-file compilations
pwilkin Nov 15, 2025
9ebdd64
starting unicode support
aldehir Nov 15, 2025
c8d94d1
add unicode support to char_parser
aldehir Nov 15, 2025
befca67
use unparsed args as additional sources
aldehir Nov 15, 2025
c077792
Refactor tests to new harness
pwilkin Nov 15, 2025
3e401ba
Fix CMakeLists
pwilkin Nov 15, 2025
3389bb7
fix rate calculation
aldehir Nov 15, 2025
715ab56
add unicode tests
aldehir Nov 15, 2025
600e589
fix trailing whitespace and line endings
aldehir Nov 15, 2025
ae31b32
Helpers + rewrite qwen3 with helpers
pwilkin Nov 16, 2025
ed4b1d0
Fix whitespace
pwilkin Nov 16, 2025
57f03e2
extract unicode functions to separate file
aldehir Nov 16, 2025
9ff6486
refactor parse unicode function
aldehir Nov 16, 2025
6f662e0
fix compiler error
aldehir Nov 16, 2025
9c02bf5
improve construction of sequence/choice parsers
aldehir Nov 16, 2025
5ee2c2d
be less clever
aldehir Nov 16, 2025
a1f461a
add make_parser helper function
aldehir Nov 16, 2025
086ba59
expand usage of make_parser, alias common_chat_msg_peg_parser_builder…
aldehir Nov 16, 2025
15c0d85
lower bench iterations
aldehir Nov 16, 2025
68abea7
add unicode support to until_parser
aldehir Nov 16, 2025
15bb3ca
add unicode support to json_string_parser
aldehir Nov 16, 2025
8928c2a
clean up unicode tests
aldehir Nov 16, 2025
5102b41
reduce unicode details to match src/unicode.cpp
aldehir Nov 16, 2025
2b1e4de
simplify even further
aldehir Nov 16, 2025
045da9e
remove unused functions
aldehir Nov 16, 2025
3d78144
fix type
aldehir Nov 16, 2025
d2b4a4a
reformat char class parsing
aldehir Nov 16, 2025
3da306b
clean up json string parser
aldehir Nov 16, 2025
26c9553
clean up + fix diagnostics
aldehir Nov 16, 2025
175cb57
reorder includes
aldehir Nov 16, 2025
f5af89a
compact builder functions
aldehir Nov 16, 2025
9199b00
replace action_parser with capture_parser, rename env to semantics
aldehir Nov 16, 2025
0c162a0
rename env to semantics
aldehir Nov 16, 2025
bea64a0
clean up common_chat_parse_context
aldehir Nov 16, 2025
27ffc9f
move type() to below constant
aldehir Nov 16, 2025
425863e
use default constructor for common_chat_peg_parser
aldehir Nov 16, 2025
4413c5c
make all operators functions for consistency
aldehir Nov 16, 2025
817a0eb
fix compilation errors in test-optional.cpp
aldehir Nov 16, 2025
f41539b
simplify result values
aldehir Nov 16, 2025
7cf9b73
rename json_string_unquoted to json_string_content
aldehir Nov 16, 2025
c0faa27
Move helper to separate class, add separate explicit and helper classes
pwilkin Nov 16, 2025
851b070
Whitespace
pwilkin Nov 16, 2025
09976dd
Change + to append()
pwilkin Nov 16, 2025
a1fc700
Reformat
pwilkin Nov 16, 2025
d0c83f8
Add extra helpers, tests and Minimax example
pwilkin Nov 16, 2025
bbcf1f6
Add some extra optional debugging prints + real example of how to use…
pwilkin Nov 16, 2025
8b1c306
fix bug in repetitions when min_count = 0 reports failures
aldehir Nov 16, 2025
b890bc7
dump rule in debug
aldehir Nov 16, 2025
c54cac7
fix token accumulation and assert parsing never fails
aldehir Nov 16, 2025
8756a3e
indent debug by depth
aldehir Nov 16, 2025
1239e10
use LOG_* in tests so logs sync up with test logs
aldehir Nov 16, 2025
7d30b27
- Add selective testing
pwilkin Nov 16, 2025
9e787d7
refactor rule() and introduce ref()
aldehir Nov 16, 2025
38a8fd6
clean up visitor
aldehir Nov 16, 2025
362cb6a
clean up indirection in root parser w.r.t rules
aldehir Nov 16, 2025
0482db6
store shared ptr directly in parser classes
aldehir Nov 17, 2025
677c17d
replace aho-corasick automation with a simple trie
aldehir Nov 17, 2025
7f7f9cf
merge
aldehir Nov 17, 2025
c50f2bc
Reset prev for qwen3 helper example variant
aldehir Nov 17, 2025
0e907b9
refactor to use value semantics with std::variant/std::visit
aldehir Nov 17, 2025
7daf9b2
simplify trie_matcher result
aldehir Nov 17, 2025
96a6980
fix linting issues
aldehir Nov 17, 2025
841bd62
add annotations to rules
aldehir Nov 17, 2025
9b3d4f2
revert test workaround
aldehir Nov 17, 2025
305ed3e
implement serializing the parser
aldehir Nov 18, 2025
3948cdf
remove redundant parsers
aldehir Nov 18, 2025
0de56d6
remove tests
aldehir Nov 18, 2025
37d5faf
gbnf generation fixes
aldehir Nov 18, 2025
e09378a
remove LOG_* use in tests
aldehir Nov 18, 2025
92d38ea
update gbnf tests to test entire grammar
aldehir Nov 19, 2025
0e1989e
clean up gbnf generation and fix a few bugs
aldehir Nov 19, 2025
ce15c64
fix typo in test output
aldehir Nov 19, 2025
3cd2af4
remove implicit conversion rules
aldehir Nov 19, 2025
fe09236
improve test output
aldehir Nov 19, 2025
b342457
rename trie_matcher to trie
aldehir Nov 19, 2025
4fb8b9d
simplify trie to just know if a node is the end of a word
aldehir Nov 19, 2025
fc83502
remove common_chat_ prefix and ensure a common_peg_ prefix to all types
aldehir Nov 20, 2025
8c24653
rename chat-peg-parser -> peg-parser
aldehir Nov 20, 2025
c4ce858
promote chat-peg-parser-helper to chat-peg-parser
aldehir Nov 20, 2025
b834f00
checkpoint
aldehir Nov 21, 2025
9b3bbd3
use a static_assert to ensure we handle every branch
aldehir Nov 22, 2025
d38b741
inline trivial peg parser builders
aldehir Nov 22, 2025
54de91f
use json strings for now
aldehir Nov 22, 2025
27fb129
implement basic and native chat peg parser builders/extractors
aldehir Nov 22, 2025
4c61d93
resolve refs to their rules
aldehir Nov 22, 2025
e57c201
remove packrat caching (for now)
aldehir Nov 22, 2025
221f4fe
update tests
aldehir Nov 22, 2025
d14edcb
compare parsers with incremental input
aldehir Nov 22, 2025
0d014e1
Merge branch 'upstream/master' into parser-combinators
aldehir Nov 22, 2025
90c7fb0
benchmark both complete and incremental parsing
aldehir Nov 22, 2025
60c93ea
add raw string generation from json schema
aldehir Nov 22, 2025
aa5043b
add support for string schemas in gbnf generation
aldehir Nov 22, 2025
2b72bfe
fix qwen example to include \n
aldehir Nov 22, 2025
2323c00
tidy up example
aldehir Nov 22, 2025
0243c56
rename extractor to mapper
aldehir Nov 22, 2025
6c1a1a8
rename ast_arena to ast
aldehir Nov 22, 2025
40e46b2
place basic tests into one
aldehir Nov 22, 2025
d83db0b
use gbnf_format_literal from json-schema-to-grammar
aldehir Nov 22, 2025
89a80c7
integrate parser with common/chat and server
aldehir Nov 23, 2025
eee2c8b
clean up schema and serialization
aldehir Nov 23, 2025
57e8cd8
add json-schema raw string tests
aldehir Nov 23, 2025
3aa4d0a
clean up json creation and remove capture parser
aldehir Nov 23, 2025
94de8b8
trim spaces from reasoning and content
aldehir Nov 23, 2025
4b1bc16
clean up redundant rules and comments
aldehir Nov 23, 2025
1e61ffb
rename input_is_complete to is_partial to match rest of project
aldehir Nov 23, 2025
3c57fab
simplify json rules
aldehir Nov 23, 2025
1ff1894
remove extraneous file
aldehir Nov 23, 2025
a5e7a1c
remove comment
aldehir Nov 23, 2025
449d954
implement += and |= operators
aldehir Nov 23, 2025
39b8213
add comments to qwen3 implementation
aldehir Nov 23, 2025
3fdfa7c
reorder arguments to common_chat_peg_parse
aldehir Nov 23, 2025
2ab9959
remove commented outdated tests
aldehir Nov 23, 2025
2a8bda3
add explicit copy constructor
aldehir Nov 23, 2025
3ff149e
fix operators and constness
aldehir Nov 23, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions common/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ add_library(${TARGET} STATIC
chat-parser.h
chat-parser-xml-toolcall.h
chat-parser-xml-toolcall.cpp
chat-peg-parser.cpp
chat-peg-parser.h
chat.cpp
chat.h
common.cpp
Expand All @@ -69,12 +71,16 @@ add_library(${TARGET} STATIC
log.h
ngram-cache.cpp
ngram-cache.h
peg-parser.cpp
peg-parser.h
regex-partial.cpp
regex-partial.h
sampling.cpp
sampling.h
speculative.cpp
speculative.h
unicode.cpp
unicode.h
)

if (BUILD_SHARED_LIBS)
Expand Down
111 changes: 111 additions & 0 deletions common/chat-peg-parser.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
#include "chat-peg-parser.h"

#include <nlohmann/json.hpp>

using json = nlohmann::json;

// Returns a view of `sv` with every trailing whitespace character (as
// classified by std::isspace) removed. The returned view starts at the same
// position as `sv`; no characters are copied.
static std::string_view trim_trailing_space(std::string_view sv) {
    std::size_t keep = sv.size();
    // Cast through unsigned char: std::isspace is undefined for negative
    // values, which plain char can hold for non-ASCII bytes.
    while (keep > 0 && std::isspace(static_cast<unsigned char>(sv[keep - 1]))) {
        --keep;
    }
    return sv.substr(0, keep);
}

// Feeds every AST node that arena.visit() yields for the given parse result
// into map(). Note that the `result` parameter shadows the `result` member
// inside this function; the member is only touched indirectly, through map().
void common_chat_peg_mapper::from_ast(const common_peg_ast_arena & arena, const common_peg_parse_result & result) {
    arena.visit(
        result,
        // map() is virtual, so derived mappers receive the nodes as well.
        [this](const common_peg_ast_node & node) { this->map(node); });
}

void common_chat_peg_mapper::map(const common_peg_ast_node & node) {
bool is_reasoning = node.tag == common_chat_peg_builder::REASONING;
bool is_content = node.tag == common_chat_peg_builder::CONTENT;

if (is_reasoning) {
result.reasoning_content = std::string(trim_trailing_space(node.text));
}

if (is_content) {
result.content = std::string(trim_trailing_space(node.text));
}
}

void common_chat_peg_native_mapper::map(const common_peg_ast_node & node) {
common_chat_peg_mapper::map(node);

bool is_tool_open = node.tag == common_chat_peg_native_builder::TOOL_OPEN;
bool is_tool_name = node.tag == common_chat_peg_native_builder::TOOL_NAME;
bool is_tool_id = node.tag == common_chat_peg_native_builder::TOOL_ID;
bool is_tool_args = node.tag == common_chat_peg_native_builder::TOOL_ARGS;

if (is_tool_open) {
result.tool_calls.emplace_back();
current_tool = &result.tool_calls.back();
}

if (is_tool_id && current_tool) {
current_tool->id = std::string(node.text);
}

if (is_tool_name && current_tool) {
current_tool->name = std::string(node.text);
}

if (is_tool_args && current_tool) {
current_tool->arguments = std::string(node.text);
}
}

void common_chat_peg_constructed_mapper::map(const common_peg_ast_node & node) {
common_chat_peg_mapper::map(node);

bool is_tool_name = node.tag == common_chat_peg_constructed_builder::TOOL_NAME;
bool is_tool_close = node.tag == common_chat_peg_constructed_builder::TOOL_CLOSE;
bool is_arg_open = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_OPEN;
bool is_arg_close = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_CLOSE;
bool is_arg_name = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_NAME;
bool is_arg_string = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_STRING_VALUE;
bool is_arg_json = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_JSON_VALUE;

if (is_tool_name) {
result.tool_calls.emplace_back();
current_tool = &result.tool_calls.back();
arg_count = 0;

current_tool->name = std::string(node.text);
current_tool->arguments = "{";
}

if (is_arg_open) {
needs_closing_quote = false;
}

if (is_arg_name && current_tool) {
if (arg_count > 0) {
current_tool->arguments += ",";
}
current_tool->arguments += json(node.text).dump() + ":";
++arg_count;
}

if (is_arg_string && current_tool) {
// Serialize to JSON, but exclude the end quote
std::string dumped = json(node.text).dump();
current_tool->arguments += dumped.substr(0, dumped.size() - 1);
needs_closing_quote = true;
}

if (is_arg_close && current_tool) {
if (needs_closing_quote) {
current_tool->arguments += "\"";
}
}

if (is_arg_json && current_tool) {
current_tool->arguments += std::string(node.text);
}

if (is_tool_close && current_tool) {
current_tool->arguments += "}";
}
}
105 changes: 105 additions & 0 deletions common/chat-peg-parser.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
#pragma once

#include "chat.h"
#include "peg-parser.h"

// Parser builder with helpers that tag sub-parsers as the generic parts of a
// chat message. common_chat_peg_mapper::map() looks for the REASONING and
// CONTENT tags when filling in a common_chat_msg.
class common_chat_peg_builder : public common_peg_parser_builder {
  public:
    // Tag names attached by the helpers below.
    static constexpr const char * REASONING_BLOCK = "reasoning-block";
    static constexpr const char * REASONING       = "reasoning";
    static constexpr const char * CONTENT         = "content";

    // Tags `p` as a whole reasoning block.
    common_peg_parser reasoning_block(const common_peg_parser & p) {
        return tag(REASONING_BLOCK, p);
    }

    // Tags `p` as reasoning text.
    common_peg_parser reasoning(const common_peg_parser & p) {
        return tag(REASONING, p);
    }

    // Tags `p` as message content.
    common_peg_parser content(const common_peg_parser & p) {
        return tag(CONTENT, p);
    }
};

// Builds a parser arena using a common_chat_peg_builder: `fn` receives the
// builder and returns the parser that is installed as the root.
inline common_peg_arena build_chat_peg_parser(const std::function<common_peg_parser(common_chat_peg_builder & builder)> & fn) {
    common_chat_peg_builder peg;
    peg.set_root(fn(peg));
    return peg.build();
}

// Base class for turning a parsed AST into a common_chat_msg.
//
// The mapper writes into the message passed to the constructor (held by
// reference, so the message must outlive the mapper). from_ast() feeds AST
// nodes to map(); derived classes override map() to handle additional tags.
class common_chat_peg_mapper {
  public:
    // Message being populated; not owned.
    common_chat_msg & result;

    common_chat_peg_mapper(common_chat_msg & msg) : result(msg) {}

    // This class is a polymorphic base (it declares virtual functions), so it
    // needs a virtual destructor: without one, destroying a derived mapper
    // through a common_chat_peg_mapper pointer is undefined behaviour.
    virtual ~common_chat_peg_mapper() = default;

    // Visits the AST for `result` in `arena`, calling map() on each node.
    virtual void from_ast(const common_peg_ast_arena & arena, const common_peg_parse_result & result);

    // Handles a single AST node; the base version maps reasoning and content.
    virtual void map(const common_peg_ast_node & node);
};

// Builder with tagging helpers for tool calls whose id, name and arguments are
// taken directly from the matched text (see common_chat_peg_native_mapper).
// The open/close/id/name helpers additionally wrap the tagged parser in atomic().
class common_chat_peg_native_builder : public common_chat_peg_builder {
  public:
    // Tag names attached by the helpers below.
    static constexpr const char * TOOL       = "tool";
    static constexpr const char * TOOL_OPEN  = "tool-open";
    static constexpr const char * TOOL_CLOSE = "tool-close";
    static constexpr const char * TOOL_ID    = "tool-id";
    static constexpr const char * TOOL_NAME  = "tool-name";
    static constexpr const char * TOOL_ARGS  = "tool-args";

    // Tags `p` as a complete tool call.
    common_peg_parser tool(const common_peg_parser & p) {
        return tag(TOOL, p);
    }

    // Tags `p` as the start of a tool call.
    common_peg_parser tool_open(const common_peg_parser & p) {
        return atomic(tag(TOOL_OPEN, p));
    }

    // Tags `p` as the end of a tool call.
    common_peg_parser tool_close(const common_peg_parser & p) {
        return atomic(tag(TOOL_CLOSE, p));
    }

    // Tags `p` as the tool call id.
    common_peg_parser tool_id(const common_peg_parser & p) {
        return atomic(tag(TOOL_ID, p));
    }

    // Tags `p` as the tool name.
    common_peg_parser tool_name(const common_peg_parser & p) {
        return atomic(tag(TOOL_NAME, p));
    }

    // Tags `p` as the tool arguments.
    common_peg_parser tool_args(const common_peg_parser & p) {
        return tag(TOOL_ARGS, p);
    }
};

// Mapper for grammars built with common_chat_peg_native_builder: in addition
// to reasoning/content it fills result.tool_calls from tool-* tagged nodes.
class common_chat_peg_native_mapper : public common_chat_peg_mapper {
    // Tool call currently being filled in; points into result.tool_calls.
    // Must start out null: map() tests this pointer for id/name/args nodes,
    // and those may be seen before any tool-open node has set it. Leaving it
    // uninitialised would make that test read an indeterminate value.
    common_chat_tool_call * current_tool = nullptr;

  public:
    common_chat_peg_native_mapper(common_chat_msg & msg) : common_chat_peg_mapper(msg) {}

    // Handles tool-open/id/name/args nodes, then defers to the base mapper's tags.
    void map(const common_peg_ast_node & node) override;
};

// Builds a parser arena using a common_chat_peg_native_builder: `fn` receives
// the builder and returns the parser that is installed as the root.
inline common_peg_arena build_chat_peg_native_parser(const std::function<common_peg_parser(common_chat_peg_native_builder & builder)> & fn) {
    common_chat_peg_native_builder peg;
    peg.set_root(fn(peg));
    return peg.build();
}

// Builder with tagging helpers for tool calls whose JSON arguments object is
// assembled from individually tagged argument names and values (see
// common_chat_peg_constructed_mapper). The open/close/name helpers
// additionally wrap the tagged parser in atomic().
class common_chat_peg_constructed_builder : public common_chat_peg_builder {
  public:
    // Tag names attached by the helpers below.
    static constexpr const char * TOOL                  = "tool";
    static constexpr const char * TOOL_OPEN             = "tool-open";
    static constexpr const char * TOOL_CLOSE            = "tool-close";
    static constexpr const char * TOOL_NAME             = "tool-name";
    static constexpr const char * TOOL_ARG              = "tool-arg";
    static constexpr const char * TOOL_ARG_OPEN         = "tool-arg-open";
    static constexpr const char * TOOL_ARG_CLOSE        = "tool-arg-close";
    static constexpr const char * TOOL_ARG_NAME         = "tool-arg-name";
    static constexpr const char * TOOL_ARG_STRING_VALUE = "tool-arg-string-value";
    static constexpr const char * TOOL_ARG_JSON_VALUE   = "tool-arg-json-value";

    // Tags `p` as a complete tool call.
    common_peg_parser tool(const common_peg_parser & p) {
        return tag(TOOL, p);
    }

    // Tags `p` as the start of a tool call.
    common_peg_parser tool_open(const common_peg_parser & p) {
        return atomic(tag(TOOL_OPEN, p));
    }

    // Tags `p` as the end of a tool call.
    common_peg_parser tool_close(const common_peg_parser & p) {
        return atomic(tag(TOOL_CLOSE, p));
    }

    // Tags `p` as the tool name.
    common_peg_parser tool_name(const common_peg_parser & p) {
        return atomic(tag(TOOL_NAME, p));
    }

    // Tags `p` as a complete argument.
    common_peg_parser tool_arg(const common_peg_parser & p) {
        return tag(TOOL_ARG, p);
    }

    // Tags `p` as the start of an argument.
    common_peg_parser tool_arg_open(const common_peg_parser & p) {
        return atomic(tag(TOOL_ARG_OPEN, p));
    }

    // Tags `p` as the end of an argument.
    common_peg_parser tool_arg_close(const common_peg_parser & p) {
        return atomic(tag(TOOL_ARG_CLOSE, p));
    }

    // Tags `p` as an argument name.
    common_peg_parser tool_arg_name(const common_peg_parser & p) {
        return atomic(tag(TOOL_ARG_NAME, p));
    }

    // Tags `p` as an argument value to be emitted as a JSON string.
    common_peg_parser tool_arg_string_value(const common_peg_parser & p) {
        return tag(TOOL_ARG_STRING_VALUE, p);
    }

    // Tags `p` as an argument value that is already JSON text.
    common_peg_parser tool_arg_json_value(const common_peg_parser & p) {
        return tag(TOOL_ARG_JSON_VALUE, p);
    }
};

// Mapper for grammars built with common_chat_peg_constructed_builder: in
// addition to reasoning/content it creates tool calls and assembles each
// call's JSON `arguments` string from the tagged argument nodes.
class common_chat_peg_constructed_mapper : public common_chat_peg_mapper {
    // Tool call currently being filled in; points into result.tool_calls.
    // Must start out null: map() tests this pointer for argument and
    // tool-close nodes, and those may be seen before any tool-name node has
    // set it. Leaving it uninitialised would make that test read an
    // indeterminate value.
    common_chat_tool_call * current_tool = nullptr;
    // Number of argument names emitted for the current tool (drives the "," separator).
    int arg_count = 0;
    // True while a string value has been written without its closing quote.
    bool needs_closing_quote = false;

  public:
    common_chat_peg_constructed_mapper(common_chat_msg & msg) : common_chat_peg_mapper(msg) {}

    // Handles tool-name/arg-*/tool-close nodes, then defers to the base mapper's tags.
    void map(const common_peg_ast_node & node) override;
};

// Builds a parser arena using a common_chat_peg_constructed_builder: `fn`
// receives the builder and returns the parser that is installed as the root.
inline common_peg_arena build_chat_peg_constructed_parser(const std::function<common_peg_parser(common_chat_peg_constructed_builder & builder)> & fn) {
    common_chat_peg_constructed_builder peg;
    peg.set_root(fn(peg));
    return peg.build();
}
Loading
Loading