Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement C23 identifiers via UAX31 (minus normalization) #15307

Merged
merged 1 commit into from Mar 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
12 changes: 12 additions & 0 deletions changelog/dmd.identifier-tables.dd
@@ -0,0 +1,12 @@
Identifier tables have been expanded to allow the new characters permitted by C23, and are now configurable from the command line

You can currently choose between ``c99``, ``c11``, ``UAX31`` (C23's) and ``all`` (the least restrictive set) for both D and ImportC.

This can be done with ``-identifiers=<table>`` and for ImportC ``-identifiers-importc=<table>``.

The default table for D is currently set to ``all``, while ImportC is set to ``c11``.
Previously both D and ImportC used the ``c99`` tables.

D's table will be swapped over to [UAX31](https://unicode.org/reports/tr31/) at a later date; this should be done in 2.117.
If at that time you find yourself using ``c99``-specific characters and are not willing to change them, you may switch back to ``all``,
although it should be unlikely that you will need to.
6 changes: 6 additions & 0 deletions changelog/dmd.importc-unicode.dd
@@ -0,0 +1,6 @@
ImportC has improved Unicode support

Universal Character Names are now supported, allowing you to use the ``\uXXXX`` and ``\UXXXXXXXX`` syntax where ``X`` is a hex digit as part of an identifier.

DigitalMars sppn does not support anything newer than C99.
It is known to be limited and using any Unicode character not in those ranges will result in an error.
2 changes: 1 addition & 1 deletion compiler/src/build.d
Expand Up @@ -1584,7 +1584,7 @@ auto sourceFiles()
stringtable.d utf.d
"),
common: fileArray(env["COMMON"], "
bitfields.d file.d int128.d blake3.d outbuffer.d smallbuffer.d
bitfields.d file.d int128.d blake3.d outbuffer.d smallbuffer.d charactertables.d identifiertables.d
"),
commonHeaders: fileArray(env["COMMON"], "
outbuffer.h
Expand Down
20 changes: 20 additions & 0 deletions compiler/src/dmd/cli.d
Expand Up @@ -466,6 +466,26 @@ dmd -cov -unittest myprog.d

$(P Note that multiple `-i=...` options are allowed, each one adds a pattern.)}"
),
Option("identifiers=<table>",
"Specify the non-ASCII tables for D identifiers",
`Set the identifier table to use for the non-ASCII values.
$(UL
$(LI $(I UAX31): UAX31)
$(LI $(I c99): C99)
$(LI $(I c11): C11)
$(LI $(I all): All, the least restrictive set, which combines all others (default))
)`
),
Option("identifiers-importc=<table>",
"Specify the non-ASCII tables for ImportC identifiers",
`Set the identifier table to use for the non-ASCII values.
$(UL
$(LI $(I UAX31): UAX31)
$(LI $(I c99): C99)
$(LI $(I c11): C11 (default))
$(LI $(I all): All, the least restrictive set, which combines all others)
)`
),
Option("ignore",
"deprecated flag, unsupported pragmas are always ignored now"
),
Expand Down
267 changes: 267 additions & 0 deletions compiler/src/dmd/common/charactertables.d
@@ -0,0 +1,267 @@
/**
* Character tables related to identifiers.
*
* Supports UAX31, C99, C11 and least restrictive (All).
*
* Copyright: Copyright (C) 1999-2024 by The D Language Foundation, All Rights Reserved
* Authors: $(LINK2 https://cattermole.co.nz, Richard (Rikki) Andrew Cattermole)
* License: $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
* Source: $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/common/charactertables.d, common/charactertables.d)
* Documentation: https://dlang.org/phobos/dmd_common_charactertables.html
* Coverage: https://codecov.io/gh/dlang/dmd/src/master/src/dmd/common/charactertables.d
*/
module dmd.common.charactertables;

@safe nothrow @nogc pure:

extern(C++):

/// Names the set of identifier character tables to look up.
enum IdentifierTable {
UAX31, /// Tables following UAX31
C99, /// Tables for C99
C11, /// Tables for C11
LR, /// Least Restrictive aka All
}

/// Pair of predicates that classify a character against one identifier table.
struct IdentifierCharLookup
{
@safe nothrow @nogc pure:

/// Tests whether a character lies in the selected table's Start ranges.
extern(C++) bool function(dchar) isStart;
/// Tests whether a character lies in the selected table's Continue ranges.
extern(C++) bool function(dchar) isContinue;

/**
Builds the lookup for the given table.

Params:
table = which identifier table the returned predicates should search

Returns: a lookup whose `isStart` and `isContinue` search that table's
Start and Continue ranges respectively.
*/
static IdentifierCharLookup forTable(IdentifierTable table)
{
import dmd.common.identifiertables;

// Awful solution to require these lambdas.
// However without them the extern(C++) ABI issues crop up for isInRange,
// and then it can't access the tables.
// `final switch` has no default: every IdentifierTable member needs a case.
final switch(table) {
case IdentifierTable.UAX31:
return IdentifierCharLookup(
(c) => isInRange!UAX31_Start(c),
(c) => isInRange!UAX31_Continue(c));
case IdentifierTable.C99:
return IdentifierCharLookup(
(c) => isInRange!FixedTable_C99_Start(c),
(c) => isInRange!FixedTable_C99_Continue(c));
case IdentifierTable.C11:
return IdentifierCharLookup(
(c) => isInRange!FixedTable_C11_Start(c),
(c) => isInRange!FixedTable_C11_Continue(c));
case IdentifierTable.LR:
return IdentifierCharLookup(
(c) => isInRange!LeastRestrictive_Start(c),
(c) => isInRange!LeastRestrictive_Continue(c));
}
}
}

/**
Convenience function for call sites that do not care which identifier table
is in effect, nor whether the character is in a start or continue position.

Params:
c = the character to test

Returns: `true` if `c` lies within the `LeastRestrictive_OfAll` ranges.
*/
bool isAnyIdentifierCharacter(dchar c)
{
import dmd.common.identifiertables;
return isInRange!LeastRestrictive_OfAll(c);
}

///
unittest
{
    // Exercise the function documented directly above;
    // isAnyContinue has its own unittest further down.
    assert(isAnyIdentifierCharacter('ğ'));
}

/**
Convenience function for call sites that do not care which identifier table
is in effect.

Params:
c = the character to test

Returns: `true` if `c` lies within the least restrictive Start ranges
(`LeastRestrictive_Start`).
*/
bool isAnyStart(dchar c)
{
import dmd.common.identifiertables;
return isInRange!LeastRestrictive_Start(c);
}

///
unittest
{
assert(isAnyStart('ğ'));
}

/**
Convenience function for call sites that do not care which identifier table
is in effect.

Params:
c = the character to test

Returns: `true` if `c` lies within the least restrictive Continue ranges
(`LeastRestrictive_Continue`).
*/
bool isAnyContinue(dchar c)
{
import dmd.common.identifiertables;
return isInRange!LeastRestrictive_Continue(c);
}

///
unittest
{
assert(isAnyContinue('ğ'));
}

/// Unicode line separator (code point U+2028)
enum LS = 0x2028;
/// Unicode paragraph separator (code point U+2029)
enum PS = 0x2029;

// Bit flags stored per character in `cmtable`; each is tested by one of the
// public predicates below.
private
{
// Set for '0' .. '7'.
enum CMoctal = 0x1;
// Set for characters accepted by c_isxdigit.
enum CMhex = 0x2;
// Set for characters accepted by c_isalnum, plus '_'.
enum CMidchar = 0x4;
// Set for 'x', 'X', 'b', 'B' and for every character that has CMdigitsecond.
enum CMzerosecond = 0x8;
// Set for '0' .. '9', 'e', 'E', 'f', 'F', 'l', 'L', 'p', 'P', 'u', 'U', 'i', '.' and '_'.
enum CMdigitsecond = 0x10;
// Set for characters below 0x80 other than '\\', '\n', '\r', 0, 0x1A and '\''.
enum CMsinglechar = 0x20;
}

/// Returns: `true` if `c` is one of `'0'` .. `'7'` (looked up in `cmtable`).
bool isoctal(const char c)
{
return (cmtable[c] & CMoctal) != 0;
}

/// Returns: `true` if `c` is a hexadecimal digit as defined by `c_isxdigit`
/// (looked up in `cmtable`).
bool ishex(const char c)
{
return (cmtable[c] & CMhex) != 0;
}

/// Returns: `true` if `c` is an ASCII letter, an ASCII digit or `'_'`
/// (looked up in `cmtable`).
bool isidchar(const char c)
{
return (cmtable[c] & CMidchar) != 0;
}

/// Returns: `true` if `c` carries the `CMzerosecond` flag in `cmtable`, i.e. it is
/// `'x'`, `'X'`, `'b'`, `'B'`, or any character for which `isDigitSecond` is true.
/// NOTE(review): presumably the characters allowed directly after a leading `0`
/// in a numeric literal; confirm against the lexer.
bool isZeroSecond(const char c)
{
return (cmtable[c] & CMzerosecond) != 0;
}

/// Returns: `true` if `c` carries the `CMdigitsecond` flag in `cmtable`, i.e. it is
/// `'0'` .. `'9'`, `'e'`, `'E'`, `'f'`, `'F'`, `'l'`, `'L'`, `'p'`, `'P'`, `'u'`, `'U'`,
/// `'i'`, `'.'` or `'_'`.
/// NOTE(review): presumably the characters allowed directly after a leading
/// non-zero digit in a numeric literal; confirm against the lexer.
bool isDigitSecond(const char c)
{
return (cmtable[c] & CMdigitsecond) != 0;
}

/// Returns: `true` if `c` carries the `CMsinglechar` flag in `cmtable`, i.e. it is
/// below 0x80 and is none of `'\\'`, `'\n'`, `'\r'`, `0`, `0x1A` or `'\''`.
/// NOTE(review): presumably the characters that may appear unescaped in a
/// character literal; confirm against the lexer.
bool issinglechar(const char c)
{
return (cmtable[c] & CMsinglechar) != 0;
}

/// Returns: `true` if `c` is an ASCII hexadecimal digit
/// (`'0'` .. `'9'`, `'a'` .. `'f'` or `'A'` .. `'F'`).
bool c_isxdigit(const int c)
{
    if ('0' <= c && c <= '9')
        return true;
    if ('a' <= c && c <= 'f')
        return true;
    return 'A' <= c && c <= 'F';
}

/// Returns: `true` if `c` is an ASCII digit or an ASCII letter of either case.
bool c_isalnum(const int c)
{
    const bool digit = '0' <= c && c <= '9';
    const bool lower = 'a' <= c && c <= 'z';
    const bool upper = 'A' <= c && c <= 'Z';
    return digit || lower || upper;
}

extern(D) private:

// originally from dmd.root.utf
/**
Tests whether `c` lies inside any of the inclusive ranges held in `Ranges`.

`Ranges` is indexed as `Ranges[i][0]` (lower bound) and `Ranges[i][1]`
(upper bound).
NOTE(review): the binary search assumes the ranges are sorted ascending and
do not overlap, and that `Ranges` is non-empty (`Ranges[0]` is read
unconditionally); confirm against the tables in dmd.common.identifiertables.

Params:
c = the character to look up

Returns: `true` if some range satisfies `Ranges[i][0] <= c <= Ranges[i][1]`.
*/
bool isInRange(alias Ranges)(dchar c)
{
size_t high = Ranges.length - 1;
// Shortcut search if c is out of range
// (starting with low > high skips the loop entirely).
// This check also guarantees c >= Ranges[0][0] inside the loop, so the
// `high = mid - 1` branch is never taken with mid == 0 and the unsigned
// index cannot wrap around.
size_t low = (c < Ranges[0][0] || Ranges[high][1] < c) ? high + 1 : 0;
// Binary search
while (low <= high)
{
// Midpoint written this way so that low + high is never computed.
const size_t mid = low + ((high - low) >> 1);
if (c < Ranges[mid][0])
high = mid - 1;
else if (Ranges[mid][1] < c)
low = mid + 1;
else
{
assert(Ranges[mid][0] <= c && c <= Ranges[mid][1]);
return true;
}
}
return false;
}

/********************************************
* Do our own char maps
*
* `cmtable` holds, for each of the 256 possible `char` values, the bitwise OR
* of the `CM*` flags that apply to it. The function literal below builds the
* table and is invoked immediately; its result initialises the immutable array.
*/
// originally from dmd.lexer (was private)
static immutable cmtable = ()
{
// Entries default-initialise to 0, i.e. no flags set.
ubyte[256] table;
foreach (const c; 0 .. table.length)
{
if ('0' <= c && c <= '7')
table[c] |= CMoctal;
if (c_isxdigit(c))
table[c] |= CMhex;
if (c_isalnum(c) || c == '_')
table[c] |= CMidchar;

// Flags read by isZeroSecond / isDigitSecond.
switch (c)
{
// These four get CMzerosecond only.
case 'x': case 'X':
case 'b': case 'B':
table[c] |= CMzerosecond;
break;

// These get both CMzerosecond and CMdigitsecond.
case '0': .. case '9':
case 'e': case 'E':
case 'f': case 'F':
case 'l': case 'L':
case 'p': case 'P':
case 'u': case 'U':
case 'i':
case '.':
case '_':
table[c] |= CMzerosecond | CMdigitsecond;
break;

default:
break;
}

// Flag read by issinglechar: the listed characters are excluded, and so is
// everything with the high bit set (0x80 and above).
switch (c)
{
case '\\':
case '\n':
case '\r':
case 0:
case 0x1A:
case '\'':
break;
default:
if (!(c & 0x80))
table[c] |= CMsinglechar;
break;
}
}
return table;
}();
20 changes: 20 additions & 0 deletions compiler/src/dmd/common/charactertables.h
@@ -0,0 +1,20 @@
/**
* Character tables related to identifiers.
*
* Supports UAX31, C99, C11 and least restrictive (All).
*
* Copyright: Copyright (C) 1999-2024 by The D Language Foundation, All Rights Reserved
* Authors: $(LINK2 https://cattermole.co.nz, Richard (Rikki) Andrew Cattermole)
* License: $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
* Source: $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/common/charactertables.d, common/charactertables.d)
*/

#pragma once

// C++ declaration of the D struct `IdentifierCharLookup` from
// dmd/common/charactertables.d: a pair of function pointers that classify a
// character against one identifier table.
struct IdentifierCharLookup final
{
// Tests whether a character is in the table's Start ranges.
bool(*isStart)(char32_t);
// Tests whether a character is in the table's Continue ranges.
bool(*isContinue)(char32_t);

// constructor not provided here.
};