Keyword detection: addressing examples 2 and 3 of issue #4712.

A contiguous sequence of characters must be fully inside or fully outside a keyword: no break of a keyword in the middle of a sequence of contiguous characters.
coq · Jul 15, 2022 · d3e57fc · d3e57fc
1 parent 62591d7
commit d3e57fc
Show file tree

Hide file tree

Showing 5 changed files with 57 additions and 9 deletions.
diff --git a/doc/sphinx/language/core/basic.rst b/doc/sphinx/language/core/basic.rst
@@ -189,11 +189,18 @@ Other tokens
   tokens.
 
   When multiple tokens match the beginning of a sequence of characters,
-  the longest matching token is used.
+  the longest matching token not cutting the sequence of characters in the middle of a subsequence forming a valid identifier is used.
   Occasionally you may need to insert spaces to separate tokens.  For example,
   if ``~`` and ``~~`` are both defined as tokens, the inputs ``~ ~`` and
   ``~~`` generate different tokens, whereas if `~~` is not defined, then the
-  two inputs are equivalent.
+  two inputs are equivalent. Also, if ``~`` and ``~_h`` are both
+  defined as tokens, the input ``~_ho`` is interpreted as ``~ _ho``
+  rather than ``~_h o`` so as not to cut the identifier-like
+  subsequence ``ho``. Contrastingly, if only ``~_h`` is defined as a token,
+  then ``~_ho`` is an error because no token can be found that includes
+  the whole subsequence ``ho`` without cutting it in the middle. Finally, if
+  all of ``~``, ``~_h`` and ``~_ho`` are defined as tokens, the input
+  ``~_ho`` is interpreted using the longest match rule, i.e. as the token ``~_ho``.
 
 Essential vocabulary
 --------------------

diff --git a/parsing/cLexer.ml b/parsing/cLexer.ml
@@ -219,6 +219,16 @@ let status_of_utf8 = function
 let lookup_utf8 loc cs =
   status_of_utf8 (lookup_utf8_char loc 0 cs)
 
+let is_ident_head l =
+  match status_of_utf8 l with
+  | EmptyStream -> false
+  | Utf8Token (st,_) -> Unicode.is_valid_ident_initial st
+
+let is_ident_trailing l =
+  match status_of_utf8 l with
+  | EmptyStream -> false
+  | Utf8Token (st,_) -> Unicode.is_valid_ident_trailing st
+
 let unlocated f x =
   let dummy_loc = Loc.(initial ToplevelInput) in
   f dummy_loc x
@@ -487,20 +497,22 @@ let update_longest_valid_token last nj tt cs =
 
 (* try to progress by peeking the next utf-8 lexeme *)
 (* and retain the longest valid special token obtained *)
-let rec progress_further loc last nj tt cs =
+let rec progress_further loc last nj in_ident_part tt cs =
   match lookup_utf8_char loc nj cs with
   | [] -> last
-  | l -> progress_utf8 loc last nj tt cs l
+  | l -> progress_utf8 loc last nj in_ident_part tt cs l
 
 (* under the same assumptions as [update_longest_valid_token], *)
 (* read the [n] bytes of the current utf-8 lexeme whose first byte is [c] *)
-and progress_utf8 loc last nj tt cs l =
+and progress_utf8 loc last nj in_ident_part tt cs l =
+  let in_ident_part' = if in_ident_part then is_ident_trailing l else is_ident_head l in
   (* compute longest match before considering utf8 block [l] *)
-  let nj, last = update_longest_valid_token last nj tt cs in
+  (* not allowing update if in the middle of an ident part *)
+  let nj, last = if in_ident_part && in_ident_part' then nj, last else update_longest_valid_token last nj tt cs in
   try
     (* descend in tree according to current utf8 block [l] *)
     let tt = List.fold_left (fun tt c -> CharMap.find c tt.branch) tt l in
-    progress_further loc last (nj + List.length l) tt cs
+    progress_further loc last (nj + List.length l) in_ident_part' tt cs
   with Not_found ->
     last
 
@@ -593,7 +605,7 @@ let find_keyword loc id bp s =
     end
   else
   let tt = ttree_find !token_tree id in
-  match progress_further loc tt.node 0 tt s with
+  match progress_further loc tt.node 0 true tt s with
   | None -> raise Not_found
   | Some (c,NoQuotation) ->
       let ep = Stream.count s in
@@ -612,7 +624,7 @@ let process_sequence loc bp c cs =
 
 (* Must be a special token *)
 let process_chars ~diff_mode loc bp l cs =
-  let t = progress_utf8 loc None (- (List.length l)) !token_tree cs l in
+  let t = progress_utf8 loc None (- (List.length l)) false !token_tree cs l in
   let ep = Stream.count cs in
   match t with
     | Some (t,NoQuotation) -> (KEYWORD t, set_loc_pos loc bp ep)

diff --git a/test-suite/bugs/bug_4712.v b/test-suite/bugs/bug_4712.v
@@ -0,0 +1,14 @@
+Module Example1.
+
+Notation "x =_s y" := (x = y) (at level 50).
+Fail Check id=_sid.
+
+End Example1.
+
+Module Example2.
+(* see https://coq.zulipchat.com/#narrow/stream/237664-math-comp-users/topic/Surprising.20parsing.20of.20the.20.E2.80.9C*l.E2.80.9D.20notation *)
+
+Notation "*l" := Nat.add.
+Check forall (T lT: Type) (x: T*lT), x = x. (* Expected to be "(x : T * lT)" *)
+
+End Example2.
diff --git a/test-suite/output/bug_4712_part2.out b/test-suite/output/bug_4712_part2.out
@@ -0,0 +1,5 @@
+File "./output/bug_4712_part2.v", line 8, characters 21-22:
+Error: Syntax Error: Lexer: Undefined token
+
+
+coqc exited with code 1
diff --git a/test-suite/output/bug_4712_part2.v b/test-suite/output/bug_4712_part2.v
@@ -0,0 +1,10 @@
+Module Example2.
+
+Notation "'foobar'" := 1.
+Definition a := foobar.
+Definition A := 2.
+Fail Definition b := foobarA. (* Good, as expected *)
+Notation "'\foobar'" := (fun x => 1 + x).
+Fail Definition b := \foobarA.
+
+End Example2.