Skip to content

Commit

Permalink
feat(excel2xml): support 2-digit-years in find_date_in_string() (DEV-…
Browse files Browse the repository at this point in the history
  • Loading branch information
jnussbaum committed Jan 4, 2024
1 parent 43cba62 commit 15d3493
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 5 deletions.
35 changes: 30 additions & 5 deletions src/dsp_tools/commands/excel2xml/excel2xml_lib.py
Expand Up @@ -94,19 +94,24 @@ def _find_french_bc_date(

def find_date_in_string(string: str) -> Optional[str]:
"""
Checks if a string contains a date value (single date, or date range), and returns the first found date as
DSP-formatted string. Returns None if no date was found.
Checks if a string contains a date value (single date, or date range),
and returns the first found date as DSP-formatted string.
Returns None if no date was found.
Notes:
- All dates are interpreted in the Christian era and the Gregorian calendar.
- BC dates are only supported in French notation (e.g. 1000-900 av. J.-C.).
- The years 0000-2999 are supported, in 3/4-digit form.
- Dates written with slashes are always interpreted in a European manner: 5/11/2021 is the 5th of November.
- In the European notation, 2-digit years are expanded to 4 digits, with the current year as watershed:
- 30.4.24 -> 30.04.2024
- 30.4.25 -> 30.04.1925
Currently supported date formats:
- 0476-09-04 -> GREGORIAN:CE:0476-09-04:CE:0476-09-04
- 0476_09_04 -> GREGORIAN:CE:0476-09-04:CE:0476-09-04
- 30.4.2021 -> GREGORIAN:CE:2021-04-30:CE:2021-04-30
- 30.4.21 -> GREGORIAN:CE:2021-04-30:CE:2021-04-30
- 5/11/2021 -> GREGORIAN:CE:2021-11-05:CE:2021-11-05
- Jan 26, 1993 -> GREGORIAN:CE:1993-01-26:CE:1993-01-26
- 28.2.-1.12.1515 -> GREGORIAN:CE:1515-02-28:CE:1515-12-01
Expand Down Expand Up @@ -175,6 +180,7 @@ def _find_date_in_string_throwing(string: str) -> str | None:
This function is the same as find_date_in_string(), but may raise a ValueError instead of returning None.
"""
year_regex = r"([0-2]?[0-9][0-9][0-9])"
year_regex_2_or_4_digits = r"((?:[0-2]?[0-9])?[0-9][0-9])"
month_regex = r"([0-1]?[0-9])"
day_regex = r"([0-3]?[0-9])"
sep_regex = r"[\./]"
Expand All @@ -186,25 +192,31 @@ def _find_date_in_string_throwing(string: str) -> str | None:

# template: 2021-01-01 | 2015_01_02
iso_date = regex.search(rf"{lookbehind}{year_regex}[_-]([0-1][0-9])[_-]([0-3][0-9]){lookahead}", string)

# template: 6.-8.3.1948 | 6/2/1947 - 24.03.1948
eur_date_range_regex = (
rf"{lookbehind}"
rf"{day_regex}{sep_regex}(?:{month_regex}{sep_regex}{year_regex}?)? ?(?:-|:|to) ?"
rf"{day_regex}{sep_regex}{month_regex}{sep_regex}{year_regex}"
rf"{day_regex}{sep_regex}(?:{month_regex}{sep_regex}{year_regex_2_or_4_digits}?)? ?(?:-|:|to) ?"
rf"{day_regex}{sep_regex}{month_regex}{sep_regex}{year_regex_2_or_4_digits}"
rf"{lookahead}"
)
eur_date_range = regex.search(eur_date_range_regex, string)

# template: 1.4.2021 | 5/11/2021
eur_date_regex = rf"{lookbehind}{day_regex}{sep_regex}{month_regex}{sep_regex}{year_regex_2_or_4_digits}{lookahead}"
eur_date = regex.search(
rf"{lookbehind}{day_regex}{sep_regex}{month_regex}{sep_regex}{year_regex}{lookahead}",
eur_date_regex,
string,
)

# template: March 9, 1908 | March5,1908 | May 11, 1906
all_months = "|".join(_months_dict)
monthname_date_regex = rf"{lookbehind}({all_months}) ?{day_regex}, ?{year_regex}{lookahead}"
monthname_date = regex.search(monthname_date_regex, string)

# template: 1849/50 | 1849-50 | 1849/1850
year_range = regex.search(lookbehind + year_regex + r"[/-](\d{1,4})" + lookahead, string)

# template: 1907
year_only = regex.search(rf"{lookbehind}{year_regex}{lookahead}", string)

Expand Down Expand Up @@ -233,13 +245,25 @@ def _from_iso_date(iso_date: Match[str]) -> str:
return f"GREGORIAN:CE:{date.isoformat()}:CE:{date.isoformat()}"


def _expand_2_digit_year(year: int) -> int:
current_year = datetime.date.today().year - 2000
if year <= current_year:
return year + 2000
elif year <= 99:
return year + 1900
else:
return year


def _from_eur_date_range(eur_date_range: Match[str]) -> str:
startday = int(eur_date_range.group(1))
startmonth = int(eur_date_range.group(2)) if eur_date_range.group(2) else int(eur_date_range.group(5))
startyear = int(eur_date_range.group(3)) if eur_date_range.group(3) else int(eur_date_range.group(6))
startyear = _expand_2_digit_year(startyear)
endday = int(eur_date_range.group(4))
endmonth = int(eur_date_range.group(5))
endyear = int(eur_date_range.group(6))
endyear = _expand_2_digit_year(endyear)
startdate = datetime.date(startyear, startmonth, startday)
enddate = datetime.date(endyear, endmonth, endday)
if enddate <= startdate:
Expand All @@ -251,6 +275,7 @@ def _from_eur_date(eur_date: Match[str]) -> str:
startday = int(eur_date.group(1))
startmonth = int(eur_date.group(2))
startyear = int(eur_date.group(3))
startyear = _expand_2_digit_year(startyear)
date = datetime.date(startyear, startmonth, startday)
return f"GREGORIAN:CE:{date.isoformat()}:CE:{date.isoformat()}"

Expand Down
26 changes: 26 additions & 0 deletions test/unittests/commands/excel2xml/test_excel2xml_lib.py
@@ -1,3 +1,4 @@
import datetime
import unittest
import warnings
from pathlib import Path
Expand Down Expand Up @@ -204,13 +205,26 @@ def test_find_date_in_string_eur_date(self) -> None:
assert excel2xml.find_date_in_string("x -2193_01_26- x") == "GREGORIAN:CE:2193-01-26:CE:2193-01-26"
assert excel2xml.find_date_in_string("x 2193_02_30 x") is None

def test_find_date_in_string_eur_date_2_digit(self) -> None:
    """template: 30.4.21 (European notation with a 2-digit year)"""
    # 2-digit forms of the current and of the following year (in 2024: "24" and "25")
    this_yy, next_yy = (str(datetime.date.today().year - 2000 + offset) for offset in (0, 1))

    # the current year is expected to be expanded to 20xx
    expected_this = f"GREGORIAN:CE:20{this_yy}-04-30:CE:20{this_yy}-04-30"
    assert excel2xml.find_date_in_string(f"x 30.4.{this_yy} x") == expected_this

    # the following year is expected to be expanded to 19xx
    expected_next = f"GREGORIAN:CE:19{next_yy}-04-30:CE:19{next_yy}-04-30"
    assert excel2xml.find_date_in_string(f"x 30.4.{next_yy} x") == expected_next

    # there is no 31st of April, so no date must be found
    assert excel2xml.find_date_in_string(f"x 31.4.{next_yy} x") is None

def test_find_date_in_string_eur_date_range(self) -> None:
    """template: 27.-28.1.1900"""
    # with or without spaces around the hyphen, the same range must be found
    expected_range = "GREGORIAN:CE:0800-02-25:CE:0800-02-26"
    for text in ("x 25.-26.2.0800 x", "x 25. - 26.2.0800 x"):
        assert excel2xml.find_date_in_string(text) == expected_range

    # an end day that is before or equal to the start day must not yield a date
    for text in ("x 25.-24.2.0800 x", "x 25.-25.2.0800 x"):
        assert excel2xml.find_date_in_string(text) is None

def test_find_date_in_string_eur_date_range_2_digit(self) -> None:
    """template: 27.-28.1.21 (day range within one month, 2-digit year)"""
    # 2-digit forms of the current and of the following year (in 2024: "24" and "25")
    this_yy, next_yy = (str(datetime.date.today().year - 2000 + offset) for offset in (0, 1))

    # the current year is expected to be expanded to 20xx
    expected_this = f"GREGORIAN:CE:20{this_yy}-04-15:CE:20{this_yy}-04-16"
    assert excel2xml.find_date_in_string(f"x 15.-16.4.{this_yy} x") == expected_this

    # the following year is expected to be expanded to 19xx
    expected_next = f"GREGORIAN:CE:19{next_yy}-04-15:CE:19{next_yy}-04-16"
    assert excel2xml.find_date_in_string(f"x 15.-16.4.{next_yy} x") == expected_next

def test_find_date_in_string_eur_date_range_across_month(self) -> None:
"""template: 26.2.-24.3.1948"""
assert excel2xml.find_date_in_string("x _1.3. - 25.4.2022_ x") == "GREGORIAN:CE:2022-03-01:CE:2022-04-25"
Expand All @@ -220,6 +234,12 @@ def test_find_date_in_string_eur_date_range_across_month(self) -> None:
assert excel2xml.find_date_in_string("x 28.2.-26.2.1515 x") is None
assert excel2xml.find_date_in_string("x 28.2.-28.2.1515 x") is None

def test_find_date_in_string_eur_date_range_across_month_2_digit(self) -> None:
    """template: 26.2.-24.3.48 (range across months, 2-digit year)"""
    # 2-digit forms of the current and of the following year (in 2024: "24" and "25")
    this_yy, next_yy = (str(datetime.date.today().year - 2000 + offset) for offset in (0, 1))

    # the current year is expected to be expanded to 20xx
    expected_this = f"GREGORIAN:CE:20{this_yy}-04-15:CE:20{this_yy}-05-01"
    assert excel2xml.find_date_in_string(f"x 15.04.-1.5.{this_yy} x") == expected_this

    # the following year is expected to be expanded to 19xx
    expected_next = f"GREGORIAN:CE:19{next_yy}-04-15:CE:19{next_yy}-05-01"
    assert excel2xml.find_date_in_string(f"x 15.04.-1.5.{next_yy} x") == expected_next

def test_find_date_in_string_eur_date_range_across_year(self) -> None:
"""template: 1.12.1973 - 6.1.1974"""
assert excel2xml.find_date_in_string("x 1.9.2022-3.1.2024 x") == "GREGORIAN:CE:2022-09-01:CE:2024-01-03"
Expand All @@ -231,6 +251,12 @@ def test_find_date_in_string_eur_date_range_across_year(self) -> None:
assert excel2xml.find_date_in_string("x 25/12/2022-03/01/2022 x") is None
assert excel2xml.find_date_in_string("x 25/12/2022-25/12/2022 x") is None

def test_find_date_in_string_eur_date_range_across_year_2_digit(self) -> None:
    """template: 1.12.73 - 6.1.74 (range across years, 2-digit years on both sides)"""
    cur = datetime.date.today().year - 2000  # in 2024, this will be 24
    nxt = cur + 1  # in 2024, this will be 25
    nxt2 = cur + 2  # in 2024, this will be 26

    # Both years are not later than the current year, so both are expanded to 20xx.
    assert excel2xml.find_date_in_string(f"x 15.04.23-1.5.{cur} x") == f"GREGORIAN:CE:2023-04-15:CE:20{cur}-05-01"

    # Both years are later than the current year, so both are expanded to 19xx.
    # The end year must be derived from the current year: a hard-coded "26" is expanded
    # to 2026 (instead of 1926) as soon as the current year reaches 2026,
    # which would make this test fail only because time has passed.
    assert (
        excel2xml.find_date_in_string(f"x 15.04.{nxt}-1.5.{nxt2} x")
        == f"GREGORIAN:CE:19{nxt}-04-15:CE:19{nxt2}-05-01"
    )

def test_find_date_in_string_monthname(self) -> None:
"""template: February 9, 1908 | Dec 5,1908"""
assert excel2xml.find_date_in_string("x Jan 26, 1993 x") == "GREGORIAN:CE:1993-01-26:CE:1993-01-26"
Expand Down

0 comments on commit 15d3493

Please sign in to comment.