Clean up

- Clean up constants file: use the standard-library unicodedata and string modules instead of the character sets stored in the constants
frnmst · Apr 5, 2024 · 3dcbc6d · 3dcbc6d
1 parent ab64da0
commit 3dcbc6d
Show file tree

Hide file tree

Showing 3 changed files with 13 additions and 832 deletions.
diff --git a/md_toc/cmark/cmark_ctype_c.py b/md_toc/cmark/cmark_ctype_c.py
@@ -20,6 +20,9 @@
 #
 r"""The cmark implementation file."""
 
+import string
+import unicodedata
+
 from ..constants import parser as md_parser
 
 # License C applies to this file except for non derivative code:
@@ -30,21 +33,17 @@
 # Return True if c is a "whitespace" character as defined by the spec.
 # 0.30
 def _cmark_cmark_isspace(char: int) -> bool:
-    value = False
-    if chr(char) in md_parser['cmark']['pseudo_re']['UWC']:
-        value = True
-
-    return value
+    # A Unicode whitespace character is any code point in the Unicode Zs
+    # general category, or a tab (U+0009), line feed (U+000A), form feed
+    # (U+000C), or carriage return (U+000D).
+    return (unicodedata.category(chr(char)) == 'Zs'
+            or chr(char) in ['\u0009', '\u000A', '\u000C', '\u000D'])
 
 
 # Return True if c is an ascii punctuation character.
 # 0.29, 0.30
 def _cmark_cmark_ispunct(char: int) -> bool:
-    value = False
-    if chr(char) in md_parser['cmark']['pseudo_re']['APC']:
-        value = True
-
-    return value
+    return chr(char) in string.punctuation
 
 
 if __name__ == '__main__':

diff --git a/md_toc/cmark/utf8_c.py b/md_toc/cmark/utf8_c.py
@@ -168,22 +168,15 @@ def _cmark_cmark_utf8proc_case_fold(
 # 0.29, 0.30
 def _cmark_cmark_utf8proc_is_space(char: int) -> bool:
     r"""Match anything in the Zs class, plus LF, CR, TAB, FF."""
-    value: bool = False
-    if chr(char) in md_parser['cmark']['pseudo_re']['UWC']:
-        value = True
-
-    return value
+    return (unicodedata.category(chr(char)) == 'Zs'
+            or chr(char) in ['\u0009', '\u000A', '\u000C', '\u000D'])
 
 
 # 0.29, 0.30
 def _cmark_cmark_utf8proc_is_punctuation(char: int) -> bool:
     r"""Match anything in the P[cdefios] classes."""
-    value: bool = False
-    if ((char < 128 and _cmark_cmark_ispunct(char))
-            or chr(char) in md_parser['cmark']['pseudo_re']['UPC']):
-        value = True
-
-    return value
+    return ((char < 128 and _cmark_cmark_ispunct(char))
+            or unicodedata.category(chr(char)).startswith('P'))
 
 
 if __name__ == '__main__':