From 5f48a6e19fb083a91d941d568370dd65ed500868 Mon Sep 17 00:00:00 2001 From: Jens Maurer Date: Wed, 27 Jul 2022 01:01:26 +0200 Subject: [PATCH 1/2] P2071R2 Named universal character escapes --- source/lex.tex | 144 +++++++++++++++++++++++++++++++++++++++- source/preprocessor.tex | 1 + 2 files changed, 143 insertions(+), 2 deletions(-) diff --git a/source/lex.tex b/source/lex.tex index 7fb88c4f94..2c7fed4d5b 100644 --- a/source/lex.tex +++ b/source/lex.tex @@ -288,6 +288,25 @@ The \grammarterm{universal-character-name} construct provides a way to name other characters. +\begin{bnf} +\nontermdef{n-char} \textnormal{one of}\br + \terminal{A B C D E F G H I J K L M N O P Q R S T U V W X Y Z}\br + \terminal{0 1 2 3 4 5 6 7 8 9}\br + \textnormal{\unicode{002d}{hyphen-minus}}\br + \textnormal{\unicode{0020}{space}} +\end{bnf} + +\begin{bnf} +\nontermdef{n-char-sequence}\br + n-char\br + n-char-sequence n-char +\end{bnf} + +\begin{bnf} +\nontermdef{named-universal-character}\br + \terminal{\textbackslash N\{} n-char-sequence \terminal{\}} +\end{bnf} + \begin{bnf} \nontermdef{hex-quad}\br hexadecimal-digit hexadecimal-digit hexadecimal-digit hexadecimal-digit @@ -303,15 +322,136 @@ \nontermdef{universal-character-name}\br \terminal{\textbackslash u} hex-quad\br \terminal{\textbackslash U} hex-quad hex-quad\br - \terminal{\textbackslash u\{} simple-hexadecimal-digit-sequence \terminal{\}} + \terminal{\textbackslash u\{} simple-hexadecimal-digit-sequence \terminal{\}}\br + named-universal-character \end{bnf} +\pnum A \grammarterm{universal-character-name} +of the form \tcode{\textbackslash u} \grammarterm{hex-quad} or +\tcode{\textbackslash U} \grammarterm{hex-quad} \grammarterm{hex-quad} designates the character in the translation character set whose UCS scalar value is the hexadecimal number represented by the sequence of \grammarterm{hexadecimal-digit}s in the \grammarterm{universal-character-name}. The program is ill-formed if that number is not a UCS scalar value. 
+ +\pnum +A \grammarterm{universal-character-name} +that is a \grammarterm{named-universal-character} +designates the character named by its \grammarterm{n-char-sequence}. +A character is so named if the \grammarterm{n-char-sequence} is equal to +\begin{itemize} +\item +the associated character name or associated character name alias +specified in ISO/IEC 10646 subclause ``Code charts and lists of character names'' +or +\item +the control code alias given in \tref{lex.charset.ucn}. +\begin{note} +The aliases in \tref{lex.charset.ucn} are provided for control characters +which otherwise have no associated character name or character name alias. +These names are derived from +the Unicode Character Database's \tcode{NameAliases.txt}. +For historical reasons, control characters are formally unnamed. +\end{note} +\end{itemize} +\begin{note} +None of the associated character names, +associated character name aliases, or +control code aliases +have leading or trailing spaces. +\end{note} + +\begin{multicolfloattable}{Control code aliases}{lex.charset.ucn}{ll} +\unicode{0000}{null} \\ +\unicode{0001}{start of heading} \\ +\unicode{0002}{start of text} \\ +\unicode{0003}{end of text} \\ +\unicode{0004}{end of transmission} \\ +\unicode{0005}{enquiry} \\ +\unicode{0006}{acknowledge} \\ +\unicode{0007}{alert} \\ +\unicode{0008}{backspace} \\ +\unicode{0009}{character tabulation} \\ +\unicode{0009}{horizontal tabulation} \\ +\unicode{000a}{line feed} \\ +\unicode{000a}{new line} \\ +\unicode{000a}{end of line} \\ +\unicode{000b}{line tabulation} \\ +\unicode{000b}{vertical tabulation} \\ +\unicode{000c}{form feed} \\ +\unicode{000d}{carriage return} \\ +\unicode{000e}{shift out} \\ +\unicode{000e}{locking-shift one} \\ +\unicode{000f}{shift in} \\ +\unicode{000f}{locking-shift zero} \\ +\unicode{0010}{data link escape} \\ +\unicode{0011}{device control one} \\ +\unicode{0012}{device control two} \\ +\unicode{0013}{device control three} \\ +\unicode{0014}{device control four} 
\\ +\unicode{0015}{negative acknowledge} \\ +\unicode{0016}{synchronous idle} \\ +\unicode{0017}{end of transmission block} \\ +\unicode{0018}{cancel} \\ +\unicode{0019}{end of medium} \\ +\unicode{001a}{substitute} \\ +\unicode{001b}{escape} \\ +\unicode{001c}{information separator four} \\ +\unicode{001c}{file separator} \\ +\unicode{001d}{information separator three} \\ +\unicode{001d}{group separator} \\ +\unicode{001e}{information separator two} \\ +\unicode{001e}{record separator} \\ +\unicode{001f}{information separator one} \\ +\unicode{001f}{unit separator} \\ +\columnbreak +\unicode{007f}{delete} \\ +\unicode{0082}{break permitted here} \\ +\unicode{0083}{no break here} \\ +\unicode{0084}{index} \\ +\unicode{0085}{next line} \\ +\unicode{0086}{start of selected area} \\ +\unicode{0087}{end of selected area} \\ +\unicode{0088}{character tabulation set} \\ +\unicode{0088}{horizontal tabulation set} \\ +\unicode{0089}{character tabulation with justification} \\ +\unicode{0089}{horizontal tabulation with justification} \\ +\unicode{008a}{line tabulation set} \\ +\unicode{008a}{vertical tabulation set} \\ +\unicode{008b}{partial line forward} \\ +\unicode{008b}{partial line down} \\ +\unicode{008c}{partial line backward} \\ +\unicode{008c}{partial line up} \\ +\unicode{008d}{reverse line feed} \\ +\unicode{008d}{reverse index} \\ +\unicode{008e}{single shift two} \\ +\unicode{008e}{single-shift-2} \\ +\unicode{008f}{single shift three} \\ +\unicode{008f}{single-shift-3} \\ +\unicode{0090}{device control string} \\ +\unicode{0091}{private use one} \\ +\unicode{0091}{private use-1} \\ +\unicode{0092}{private use two} \\ +\unicode{0092}{private use-2} \\ +\unicode{0093}{set transmit state} \\ +\unicode{0094}{cancel character} \\ +\unicode{0095}{message waiting} \\ +\unicode{0096}{start of guarded area} \\ +\unicode{0096}{start of protected area} \\ +\unicode{0097}{end of guarded area} \\ +\unicode{0097}{end of protected area} \\ +\unicode{0098}{start of string} 
\\ +\unicode{009a}{single character introducer} \\ +\unicode{009b}{control sequence introducer} \\ +\unicode{009c}{string terminator} \\ +\unicode{009d}{operating system command} \\ +\unicode{009e}{privacy message} \\ +\unicode{009f}{application program command} \\ +\end{multicolfloattable} + +\pnum If a \grammarterm{universal-character-name} outside the \grammarterm{c-char-sequence}, \grammarterm{s-char-sequence}, or \grammarterm{r-char-sequence} of @@ -1345,7 +1485,7 @@ \begin{bnf} \nontermdef{conditional-escape-sequence-char}\br - \textnormal{any member of the basic character set that is not an} octal-digit\textnormal{, a} simple-escape-sequence-char\textnormal{, or the characters \terminal{o}, \terminal{u}, \terminal{U}, or \terminal{x}} + \textnormal{any member of the basic character set that is not an} octal-digit\textnormal{, a} simple-escape-sequence-char\textnormal{, or the characters \terminal{N}, \terminal{o}, \terminal{u}, \terminal{U}, or \terminal{x}} \end{bnf} \pnum diff --git a/source/preprocessor.tex b/source/preprocessor.tex index 775055f57e..190f7dc1eb 100644 --- a/source/preprocessor.tex +++ b/source/preprocessor.tex @@ -1789,6 +1789,7 @@ \defnxname{cpp_lambdas} & \tcode{200907L} \\ \rowsep \defnxname{cpp_modules} & \tcode{201907L} \\ \rowsep \defnxname{cpp_multidimensional_subscript} & \tcode{202110L} \\ \rowsep +\defnxname{cpp_named_character_escapes} & \tcode{202207L} \\ \rowsep \defnxname{cpp_namespace_attributes} & \tcode{201411L} \\ \rowsep \defnxname{cpp_noexcept_function_type} & \tcode{201510L} \\ \rowsep \defnxname{cpp_nontype_template_args} & \tcode{201911L} \\ \rowsep From f9cb3dbcc6cf10632aa453b179d927d1b6be459d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20K=C3=B6ppe?= Date: Fri, 5 Aug 2022 23:07:37 +0100 Subject: [PATCH 2/2] [lex.charset] Add delimited form \u{...} of universal character name The adoption of P2290R3 "Delimited escape sequences" via CWG Motion 8 added another form of universal character name, which we now need 
to list as well. --- source/lex.tex | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/source/lex.tex b/source/lex.tex index 2c7fed4d5b..afc0772746 100644 --- a/source/lex.tex +++ b/source/lex.tex @@ -328,8 +328,9 @@ \pnum A \grammarterm{universal-character-name} -of the form \tcode{\textbackslash u} \grammarterm{hex-quad} or -\tcode{\textbackslash U} \grammarterm{hex-quad} \grammarterm{hex-quad} +of the form \tcode{\textbackslash u} \grammarterm{hex-quad}, +\tcode{\textbackslash U} \grammarterm{hex-quad} \grammarterm{hex-quad}, or +\tcode{\textbackslash u\{\grammarterm{simple-hexadecimal-digit-sequence}\}} designates the character in the translation character set whose UCS scalar value is the hexadecimal number represented by the sequence of \grammarterm{hexadecimal-digit}s