Skip to content

Commit

Permalink
isspace: don't include <noBreak> characters.
Browse files Browse the repository at this point in the history
  • Loading branch information
rhdunn committed Feb 12, 2017
1 parent 5f9dc11 commit f109bb9
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 2 deletions.
10 changes: 9 additions & 1 deletion src/ctype.c
Expand Up @@ -140,10 +140,18 @@ int ucd_isspace(codepoint_t c)
{
case UCD_CATEGORY_Zl:
case UCD_CATEGORY_Zp:
return 1;
case UCD_CATEGORY_Zs:
switch (c) // Exclude characters with the <noBreak> DispositionType
{
case 0x00A0: // U+00A0 : NO-BREAK SPACE
case 0x2007: // U+2007 : FIGURE SPACE
case 0x202F: // U+202F : NARROW NO-BREAK SPACE
return 0;
}
return 1;
case UCD_CATEGORY_Cc:
switch (c) // Some control characters are also whitespace characters:
switch (c) // Include control characters marked as White_Space
{
case 0x09: // U+0009 : CHARACTER TABULATION
case 0x0A: // U+000A : LINE FEED
Expand Down
6 changes: 5 additions & 1 deletion tools/printdata.py
Expand Up @@ -50,7 +50,11 @@ def isxdigit(data):
return 1 if data['CodePoint'].char() in '0123456789ABCDEFabcdef' else 0

def isspace(data):
    """Return 1 if the code point record counts as whitespace, else 0.

    A record counts as whitespace when its ``White_Space`` property is
    truthy and its ``DecompositionType`` does not start with ``<noBreak>``.
    No-break characters are excluded even though they carry the
    ``White_Space`` property.

    ``data`` is a mapping of property names to values; only its ``get``
    method is used.
    """
    if not data.get('White_Space', 0):
        return 0
    dt = data.get('DecompositionType', '')
    # A record may carry an explicit None decomposition type; treat it the
    # same as a missing one (i.e. not a no-break character).
    if dt is None:
        return 1
    return 0 if dt.startswith('<noBreak>') else 1

def isupper(data):
if data.get('LowerCase', null) != null:
Expand Down

0 comments on commit f109bb9

Please sign in to comment.