diff --git a/src/dsp_tools/commands/excel2xml/excel2xml_lib.py b/src/dsp_tools/commands/excel2xml/excel2xml_lib.py
index ea5bcf0b9..7739d8186 100644
--- a/src/dsp_tools/commands/excel2xml/excel2xml_lib.py
+++ b/src/dsp_tools/commands/excel2xml/excel2xml_lib.py
@@ -94,19 +94,24 @@ def _find_french_bc_date(
 
 def find_date_in_string(string: str) -> Optional[str]:
     """
-    Checks if a string contains a date value (single date, or date range), and returns the first found date as
-    DSP-formatted string. Returns None if no date was found.
+    Checks if a string contains a date value (single date, or date range),
+    and returns the first found date as DSP-formatted string.
+    Returns None if no date was found.
 
     Notes:
         - All dates are interpreted in the Christian era and the Gregorian calendar.
         - BC dates are only supported in French notation (e.g. 1000-900 av. J.-C.).
         - The years 0000-2999 are supported, in 3/4-digit form.
         - Dates written with slashes are always interpreted in a European manner: 5/11/2021 is the 5th of November.
+        - In the European notation, years with exactly 2 digits are expanded to 4 digits (0024 stays the year 24).
+          The current year is the watershed, e.g. in 2024:
+          30.4.24 -> 30.04.2024, but 30.4.25 -> 30.04.1925
 
     Currently supported date formats:
         - 0476-09-04 -> GREGORIAN:CE:0476-09-04:CE:0476-09-04
         - 0476_09_04 -> GREGORIAN:CE:0476-09-04:CE:0476-09-04
         - 30.4.2021 -> GREGORIAN:CE:2021-04-30:CE:2021-04-30
+        - 30.4.21 -> GREGORIAN:CE:2021-04-30:CE:2021-04-30
         - 5/11/2021 -> GREGORIAN:CE:2021-11-05:CE:2021-11-05
         - Jan 26, 1993 -> GREGORIAN:CE:1993-01-26:CE:1993-01-26
         - 28.2.-1.12.1515 -> GREGORIAN:CE:1515-02-28:CE:1515-12-01
@@ -175,6 +180,7 @@ def _find_date_in_string_throwing(string: str) -> str | None:
     This function is the same as find_date_in_string(), but may raise a ValueError instead of returning None.
     """
     year_regex = r"([0-2]?[0-9][0-9][0-9])"
+    year_regex_2_or_4_digits = r"((?:[0-2]?[0-9])?[0-9][0-9])"
     month_regex = r"([0-1]?[0-9])"
     day_regex = r"([0-3]?[0-9])"
     sep_regex = r"[\./]"
@@ -186,25 +192,31 @@ def _find_date_in_string_throwing(string: str) -> str | None:
 
     # template: 2021-01-01 | 2015_01_02
     iso_date = regex.search(rf"{lookbehind}{year_regex}[_-]([0-1][0-9])[_-]([0-3][0-9]){lookahead}", string)
+
     # template: 6.-8.3.1948 | 6/2/1947 - 24.03.1948
     eur_date_range_regex = (
         rf"{lookbehind}"
-        rf"{day_regex}{sep_regex}(?:{month_regex}{sep_regex}{year_regex}?)? ?(?:-|:|to) ?"
-        rf"{day_regex}{sep_regex}{month_regex}{sep_regex}{year_regex}"
+        rf"{day_regex}{sep_regex}(?:{month_regex}{sep_regex}{year_regex_2_or_4_digits}?)? ?(?:-|:|to) ?"
+        rf"{day_regex}{sep_regex}{month_regex}{sep_regex}{year_regex_2_or_4_digits}"
         rf"{lookahead}"
     )
     eur_date_range = regex.search(eur_date_range_regex, string)
+
     # template: 1.4.2021 | 5/11/2021
+    eur_date_regex = rf"{lookbehind}{day_regex}{sep_regex}{month_regex}{sep_regex}{year_regex_2_or_4_digits}{lookahead}"
     eur_date = regex.search(
-        rf"{lookbehind}{day_regex}{sep_regex}{month_regex}{sep_regex}{year_regex}{lookahead}",
+        eur_date_regex,
         string,
     )
+
     # template: March 9, 1908 | March5,1908 | May 11, 1906
     all_months = "|".join(_months_dict)
     monthname_date_regex = rf"{lookbehind}({all_months}) ?{day_regex}, ?{year_regex}{lookahead}"
     monthname_date = regex.search(monthname_date_regex, string)
+
     # template: 1849/50 | 1849-50 | 1849/1850
     year_range = regex.search(lookbehind + year_regex + r"[/-](\d{1,4})" + lookahead, string)
+
     # template: 1907
     year_only = regex.search(rf"{lookbehind}{year_regex}{lookahead}", string)
 
@@ -233,13 +245,34 @@ def _from_iso_date(iso_date: Match[str]) -> str:
     return f"GREGORIAN:CE:{date.isoformat()}:CE:{date.isoformat()}"
 
 
+def _expand_2_digit_year(year: str) -> int:
+    """
+    Convert a matched year string to an integer, expanding 2-digit years to 4 digits.
+
+    Only years written with exactly 2 digits are expanded, with the current year as watershed:
+    a 2-digit year that would lie in the future is placed in the previous century.
+    Years written with 3 or 4 digits (e.g. "800", "0024") are returned unchanged.
+
+    Args:
+        year: the year as matched by the regex (2, 3, or 4 digits)
+
+    Returns:
+        the year as integer, expanded to 4 digits if it was written with 2 digits
+    """
+    if len(year) != 2:
+        return int(year)
+    current_year = datetime.date.today().year
+    expanded = current_year - current_year % 100 + int(year)
+    return expanded if expanded <= current_year else expanded - 100
+
+
 def _from_eur_date_range(eur_date_range: Match[str]) -> str:
     startday = int(eur_date_range.group(1))
     startmonth = int(eur_date_range.group(2)) if eur_date_range.group(2) else int(eur_date_range.group(5))
-    startyear = int(eur_date_range.group(3)) if eur_date_range.group(3) else int(eur_date_range.group(6))
+    startyear = _expand_2_digit_year(eur_date_range.group(3) or eur_date_range.group(6))
     endday = int(eur_date_range.group(4))
     endmonth = int(eur_date_range.group(5))
-    endyear = int(eur_date_range.group(6))
+    endyear = _expand_2_digit_year(eur_date_range.group(6))
     startdate = datetime.date(startyear, startmonth, startday)
     enddate = datetime.date(endyear, endmonth, endday)
     if enddate <= startdate:
@@ -251,6 +284,6 @@ def _from_eur_date(eur_date: Match[str]) -> str:
     startday = int(eur_date.group(1))
     startmonth = int(eur_date.group(2))
-    startyear = int(eur_date.group(3))
+    startyear = _expand_2_digit_year(eur_date.group(3))
     date = datetime.date(startyear, startmonth, startday)
     return f"GREGORIAN:CE:{date.isoformat()}:CE:{date.isoformat()}"
 
diff --git a/test/unittests/commands/excel2xml/test_excel2xml_lib.py b/test/unittests/commands/excel2xml/test_excel2xml_lib.py
index 464eb988a..166a58699 100644
--- a/test/unittests/commands/excel2xml/test_excel2xml_lib.py
+++ b/test/unittests/commands/excel2xml/test_excel2xml_lib.py
@@ -1,3 +1,4 @@
+import datetime
 import unittest
 import warnings
 from pathlib import Path
@@ -204,6 +205,18 @@ def test_find_date_in_string_eur_date(self) -> None:
         assert excel2xml.find_date_in_string("x -2193_01_26- x") == "GREGORIAN:CE:2193-01-26:CE:2193-01-26"
         assert excel2xml.find_date_in_string("x 2193_02_30 x") is None
 
+    def test_find_date_in_string_eur_date_2_digit(self) -> None:
+        cur = str(datetime.date.today().year - 2000)  # in 2024, this will be "24"
+        nxt = str(datetime.date.today().year - 2000 + 1)  # in 2024, this will be "25"
+        assert excel2xml.find_date_in_string(f"x 30.4.{cur} x") == f"GREGORIAN:CE:20{cur}-04-30:CE:20{cur}-04-30"
+        assert excel2xml.find_date_in_string(f"x 30.4.{nxt} x") == f"GREGORIAN:CE:19{nxt}-04-30:CE:19{nxt}-04-30"
+        assert excel2xml.find_date_in_string(f"x 31.4.{nxt} x") is None
+
+    def test_find_date_in_string_eur_date_zero_padded_year(self) -> None:
+        """zero-padded 4-digit years must not be mistaken for 2-digit years"""
+        assert excel2xml.find_date_in_string("x 30.4.0024 x") == "GREGORIAN:CE:0024-04-30:CE:0024-04-30"
+        assert excel2xml.find_date_in_string("x 30.4.0099 x") == "GREGORIAN:CE:0099-04-30:CE:0099-04-30"
+
     def test_find_date_in_string_eur_date_range(self) -> None:
         """template: 27.-28.1.1900"""
         assert excel2xml.find_date_in_string("x 25.-26.2.0800 x") == "GREGORIAN:CE:0800-02-25:CE:0800-02-26"
@@ -211,6 +224,12 @@ def test_find_date_in_string_eur_date_range(self) -> None:
         assert excel2xml.find_date_in_string("x 25.-24.2.0800 x") is None
         assert excel2xml.find_date_in_string("x 25.-25.2.0800 x") is None
 
+    def test_find_date_in_string_eur_date_range_2_digit(self) -> None:
+        cur = str(datetime.date.today().year - 2000)  # in 2024, this will be "24"
+        nxt = str(datetime.date.today().year - 2000 + 1)  # in 2024, this will be "25"
+        assert excel2xml.find_date_in_string(f"x 15.-16.4.{cur} x") == f"GREGORIAN:CE:20{cur}-04-15:CE:20{cur}-04-16"
+        assert excel2xml.find_date_in_string(f"x 15.-16.4.{nxt} x") == f"GREGORIAN:CE:19{nxt}-04-15:CE:19{nxt}-04-16"
+
     def test_find_date_in_string_eur_date_range_across_month(self) -> None:
         """template: 26.2.-24.3.1948"""
         assert excel2xml.find_date_in_string("x _1.3. - 25.4.2022_ x") == "GREGORIAN:CE:2022-03-01:CE:2022-04-25"
@@ -220,6 +239,12 @@ def test_find_date_in_string_eur_date_range_across_month(self) -> None:
         assert excel2xml.find_date_in_string("x 28.2.-26.2.1515 x") is None
         assert excel2xml.find_date_in_string("x 28.2.-28.2.1515 x") is None
 
+    def test_find_date_in_string_eur_date_range_across_month_2_digit(self) -> None:
+        cur = str(datetime.date.today().year - 2000)  # in 2024, this will be "24"
+        nxt = str(datetime.date.today().year - 2000 + 1)  # in 2024, this will be "25"
+        assert excel2xml.find_date_in_string(f"x 15.04.-1.5.{cur} x") == f"GREGORIAN:CE:20{cur}-04-15:CE:20{cur}-05-01"
+        assert excel2xml.find_date_in_string(f"x 15.04.-1.5.{nxt} x") == f"GREGORIAN:CE:19{nxt}-04-15:CE:19{nxt}-05-01"
+
     def test_find_date_in_string_eur_date_range_across_year(self) -> None:
         """template: 1.12.1973 - 6.1.1974"""
         assert excel2xml.find_date_in_string("x 1.9.2022-3.1.2024 x") == "GREGORIAN:CE:2022-09-01:CE:2024-01-03"
@@ -231,6 +256,18 @@ def test_find_date_in_string_eur_date_range_across_year(self) -> None:
         assert excel2xml.find_date_in_string("x 25/12/2022-03/01/2022 x") is None
         assert excel2xml.find_date_in_string("x 25/12/2022-25/12/2022 x") is None
 
+    def test_find_date_in_string_eur_date_range_across_year_2_digit(self) -> None:
+        # all years are derived from today's date, so that the test doesn't start failing when the year changes
+        this_year = datetime.date.today().year
+        prv = str(this_year - 2000 - 1)  # in 2024, this will be "23"
+        cur = str(this_year - 2000)  # in 2024, this will be "24"
+        nxt = str(this_year - 2000 + 1)  # in 2024, this will be "25"
+        nxt2 = str(this_year - 2000 + 2)  # in 2024, this will be "26"
+        res_past = excel2xml.find_date_in_string(f"x 15.04.{prv}-1.5.{cur} x")
+        assert res_past == f"GREGORIAN:CE:20{prv}-04-15:CE:20{cur}-05-01"
+        res_future = excel2xml.find_date_in_string(f"x 15.04.{nxt}-1.5.{nxt2} x")
+        assert res_future == f"GREGORIAN:CE:19{nxt}-04-15:CE:19{nxt2}-05-01"
+
     def test_find_date_in_string_monthname(self) -> None:
         """template: February 9, 1908 | Dec 5,1908"""
         assert excel2xml.find_date_in_string("x Jan 26, 1993 x") == "GREGORIAN:CE:1993-01-26:CE:1993-01-26"