Skip to content

Commit

Permalink
Keyword detection: addressing examples 2 and 3 of issue #4712.
Browse files Browse the repository at this point in the history
A contiguous sequence of characters must be fully inside or fully
outside a keyword: no break of a keyword in the middle of a sequence
of contiguous characters.
  • Loading branch information
herbelin committed Jul 15, 2022
1 parent 62591d7 commit d3e57fc
Show file tree
Hide file tree
Showing 5 changed files with 57 additions and 9 deletions.
11 changes: 9 additions & 2 deletions doc/sphinx/language/core/basic.rst
Original file line number Diff line number Diff line change
Expand Up @@ -189,11 +189,18 @@ Other tokens
tokens.

When multiple tokens match the beginning of a sequence of characters,
the longest matching token is used.
the longest matching token not cutting the sequence of characters in the middle of a subsequence forming a valid identifier is used.
Occasionally you may need to insert spaces to separate tokens. For example,
if ``~`` and ``~~`` are both defined as tokens, the inputs ``~ ~`` and
``~~`` generate different tokens, whereas if `~~` is not defined, then the
two inputs are equivalent.
two inputs are equivalent. Also, if ``~`` and ``~_h`` are both
defined as tokens, the input ``~_ho`` is interpreted as ``~ _ho``
rather than ``~_h o`` so as not to cut the identifier-like
subsequence ``ho``. Contrastingly, if only ``~_h`` is defined as a token,
then ``~_ho`` is an error because no token can be found that includes
the whole subsequence ``ho`` without cutting it in the middle. Finally, if
all of ``~``, ``~_h`` and ``~_ho`` are defined as tokens, the input
``~_ho`` is interpreted using the longest match rule, i.e. as the token ``~_ho``.

Essential vocabulary
--------------------
Expand Down
26 changes: 19 additions & 7 deletions parsing/cLexer.ml
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,16 @@ let status_of_utf8 = function
let lookup_utf8 loc cs =
status_of_utf8 (lookup_utf8_char loc 0 cs)

let is_ident_head l =
match status_of_utf8 l with
| EmptyStream -> false
| Utf8Token (st,_) -> Unicode.is_valid_ident_initial st

let is_ident_trailing l =
match status_of_utf8 l with
| EmptyStream -> false
| Utf8Token (st,_) -> Unicode.is_valid_ident_trailing st

let unlocated f x =
let dummy_loc = Loc.(initial ToplevelInput) in
f dummy_loc x
Expand Down Expand Up @@ -487,20 +497,22 @@ let update_longest_valid_token last nj tt cs =

(* try to progress by peeking the next utf-8 lexeme *)
(* and retain the longest valid special token obtained *)
let rec progress_further loc last nj tt cs =
let rec progress_further loc last nj in_ident_part tt cs =
match lookup_utf8_char loc nj cs with
| [] -> last
| l -> progress_utf8 loc last nj tt cs l
| l -> progress_utf8 loc last nj in_ident_part tt cs l

(* under the same assumptions as [update_longest_valid_token], *)
(* read the [n] bytes of the current utf-8 lexeme whose first byte is [c] *)
and progress_utf8 loc last nj tt cs l =
and progress_utf8 loc last nj in_ident_part tt cs l =
let in_ident_part' = if in_ident_part then is_ident_trailing l else is_ident_head l in
(* compute longest match before considering utf8 block [l] *)
let nj, last = update_longest_valid_token last nj tt cs in
(* not allowing update if in the middle of an ident part *)
let nj, last = if in_ident_part && in_ident_part' then nj, last else update_longest_valid_token last nj tt cs in
try
(* descend in tree according to current utf8 block [l] *)
let tt = List.fold_left (fun tt c -> CharMap.find c tt.branch) tt l in
progress_further loc last (nj + List.length l) tt cs
progress_further loc last (nj + List.length l) in_ident_part' tt cs
with Not_found ->
last

Expand Down Expand Up @@ -593,7 +605,7 @@ let find_keyword loc id bp s =
end
else
let tt = ttree_find !token_tree id in
match progress_further loc tt.node 0 tt s with
match progress_further loc tt.node 0 true tt s with
| None -> raise Not_found
| Some (c,NoQuotation) ->
let ep = Stream.count s in
Expand All @@ -612,7 +624,7 @@ let process_sequence loc bp c cs =

(* Must be a special token *)
let process_chars ~diff_mode loc bp l cs =
let t = progress_utf8 loc None (- (List.length l)) !token_tree cs l in
let t = progress_utf8 loc None (- (List.length l)) false !token_tree cs l in
let ep = Stream.count cs in
match t with
| Some (t,NoQuotation) -> (KEYWORD t, set_loc_pos loc bp ep)
Expand Down
14 changes: 14 additions & 0 deletions test-suite/bugs/bug_4712.v
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
Module Example1.

Notation "x =_s y" := (x = y) (at level 50).
Fail Check id=_sid.

End Example1.

Module Example2.
(* see https://coq.zulipchat.com/#narrow/stream/237664-math-comp-users/topic/Surprising.20parsing.20of.20the.20.E2.80.9C*l.E2.80.9D.20notation *)

Notation "*l" := Nat.add.
Check forall (T lT: Type) (x: T*lT), x = x. (* Expected to be "(x : T * lT)" *)

End Example2.
5 changes: 5 additions & 0 deletions test-suite/output/bug_4712_part2.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
File "./output/bug_4712_part2.v", line 8, characters 21-22:
Error: Syntax Error: Lexer: Undefined token


coqc exited with code 1
10 changes: 10 additions & 0 deletions test-suite/output/bug_4712_part2.v
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
Module Example2.

Notation "'foobar'" := 1.
Definition a := foobar.
Definition A := 2.
Fail Definition b := foobarA. (* Good, as expected *)
Notation "'\foobar'" := (fun x => 1 + x).
Fail Definition b := \foobarA.

End Example2.

0 comments on commit d3e57fc

Please sign in to comment.