From e9a4691053e79eeb34d9d838bc130e61cfa0aba2 Mon Sep 17 00:00:00 2001 From: Tony Narlock Date: Sun, 10 Dec 2023 07:05:05 -0600 Subject: [PATCH 1/3] expansion(kRSGeneric): Fix double apostrophe (non-Chinese simplified radical) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit via https://www.unicode.org/reports/tr38/#kRSUnicode: > The standard radical-stroke count for this ideograph in the form “radical.additional strokes.” The radical is indicated by a number in the range 1–214, followed by an optional single apostrophe (U+0027 ' apostrophe) or double apostrophe ('') suffix. A single apostrophe after the radical indicates a Chinese simplified version of the given radical. Two apostrophes after the radical indicates a non-Chinese simplified version of the given radical. The “additional strokes” value is the residual stroke-count, the count of all strokes remaining after eliminating all strokes associated with the radical. --- src/unihan_etl/expansion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/unihan_etl/expansion.py b/src/unihan_etl/expansion.py index d8435b6e..6f5dffcd 100644 --- a/src/unihan_etl/expansion.py +++ b/src/unihan_etl/expansion.py @@ -577,7 +577,7 @@ def _expand_kRSGeneric(value: t.List[str]) -> t.List[kRSGenericDict]: pattern = re.compile( r""" (?P<radical>[1-9][0-9]{0,2}) - (?P<simplified>\'?)\. + (?P<simplified>\'{0,2})\. 
(?P<strokes>-?[0-9]{1,2}) """, re.X, From e57bf2279a07616e30242f011a5e9e5640efff48 Mon Sep 17 00:00:00 2001 From: Tony Narlock Date: Sun, 10 Dec 2023 07:10:36 -0600 Subject: [PATCH 2/3] docs(CHANGES): Document kRSGeneric fix --- CHANGES | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGES b/CHANGES index 624cd513..14dcc156 100644 --- a/CHANGES +++ b/CHANGES @@ -19,6 +19,10 @@ $ pipx install --suffix=@next unihan-etl --pip-args '\--pre' --force +### Bug fix + +- Expansions: Fix loading of double apostrophe values via `kRSUnicode` via `kRSGeneric` (#304) + ## unihan-etl 0.30.0post0 (2023-11-26) ### CI From 7a219a49095ed55dc73ce02de663943d8ca4cd77 Mon Sep 17 00:00:00 2001 From: Tony Narlock Date: Sun, 10 Dec 2023 07:18:13 -0600 Subject: [PATCH 3/3] doctests(expansion): Add test for _expand_kRSGeneric() --- src/unihan_etl/expansion.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/unihan_etl/expansion.py b/src/unihan_etl/expansion.py index 6f5dffcd..1d86eacc 100644 --- a/src/unihan_etl/expansion.py +++ b/src/unihan_etl/expansion.py @@ -573,7 +573,14 @@ class kRSGenericDict(t.TypedDict): def _expand_kRSGeneric(value: t.List[str]) -> t.List[kRSGenericDict]: - """Expand kRSGeneric field.""" + """Expand kRSGeneric field. + + Examples + -------- + >>> _expand_kRSGeneric(['5.10', "213''.0"]) # doctest: +NORMALIZE_WHITESPACE + [{'radical': 5, 'strokes': 10, 'simplified': False}, + {'radical': 213, 'strokes': 0, 'simplified': False}] + """ pattern = re.compile( r""" (?P<radical>[1-9][0-9]{0,2})