Skip to content

Commit

Permalink
feat(excel2xml): add support for 3-digit-years in find_date_in_string() (DEV-3005) (#652)
Browse files Browse the repository at this point in the history
  • Loading branch information
jnussbaum committed Nov 23, 2023
1 parent 7347be7 commit 1afee03
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 15 deletions.
23 changes: 14 additions & 9 deletions src/dsp_tools/commands/excel2xml/excel2xml_lib.py
Expand Up @@ -68,7 +68,7 @@ def find_date_in_string(string: str) -> Optional[str]:
Notes:
- All dates are interpreted in the Christian era and the Gregorian calendar. There is no support for BC dates or
non-Gregorian calendars.
- The years 0000-2999 are supported, in 4-digit form.
- The years 0000-2999 are supported, in 3- or 4-digit form.
- Dates written with slashes are always interpreted in a European manner: 5/11/2021 is the 5th of November.
Currently supported date formats:
Expand All @@ -78,13 +78,15 @@ def find_date_in_string(string: str) -> Optional[str]:
- 5/11/2021 -> GREGORIAN:CE:2021-11-05:CE:2021-11-05
- Jan 26, 1993 -> GREGORIAN:CE:1993-01-26:CE:1993-01-26
- February26,2051 -> GREGORIAN:CE:2051-02-26:CE:2051-02-26
- 28.2.-1.12.1515 --> GREGORIAN:CE:1515-02-28:CE:1515-12-01
- 25.-26.2.0800 --> GREGORIAN:CE:0800-02-25:CE:0800-02-26
- 1.9.2022-3.1.2024 --> GREGORIAN:CE:2022-09-01:CE:2024-01-03
- 1848 -> GREGORIAN:CE:1848:CE:1848
- 28.2.-1.12.1515 -> GREGORIAN:CE:1515-02-28:CE:1515-12-01
- 25.-26.2.0800 -> GREGORIAN:CE:0800-02-25:CE:0800-02-26
- 1.9.2022-3.1.2024 -> GREGORIAN:CE:2022-09-01:CE:2024-01-03
- 800 -> GREGORIAN:CE:800:CE:800
- 1849/1850 -> GREGORIAN:CE:1849:CE:1850
- 1849/50 -> GREGORIAN:CE:1849:CE:1850
- 1845-50 -> GREGORIAN:CE:1845:CE:1850
- 840-50 -> GREGORIAN:CE:840:CE:850
- 840-1 -> GREGORIAN:CE:840:CE:841
Args:
string: string to check
Expand Down Expand Up @@ -134,7 +136,7 @@ def find_date_in_string(string: str) -> Optional[str]:
startyear: Optional[int] = None
endyear: Optional[int] = None

year_regex = r"([0-2][0-9][0-9][0-9])"
year_regex = r"([0-2]?[0-9][0-9][0-9])"
month_regex = r"([0-1]?[0-9])"
day_regex = r"([0-3]?[0-9])"
sep_regex = r"[\./]"
Expand All @@ -161,7 +163,7 @@ def find_date_in_string(string: str) -> Optional[str]:
monthname_date_regex = rf"{lookbehind}({all_months}) ?{day_regex}, ?{year_regex}{lookahead}"
monthname_date = regex.search(monthname_date_regex, string)
# template: 1849/50 | 1849-50 | 1849/1850 | 840-50 | 840-1
year_range = regex.search(lookbehind + year_regex + r"[/-](\d{2}|\d{4})" + lookahead, string)
year_range = regex.search(lookbehind + year_regex + r"[/-](\d{1,4})" + lookahead, string)
# template: 1907
year_only = regex.search(rf"{lookbehind}{year_regex}{lookahead}", string)

Expand Down Expand Up @@ -213,8 +215,11 @@ def find_date_in_string(string: str) -> Optional[str]:
elif year_range:
startyear = int(year_range.group(1))
endyear = int(year_range.group(2))
if endyear // 100 == 0:
# endyear is only 2-digit: add the first two digits of startyear
if endyear // 10 == 0:
# endyear is only 1-digit: add the first 2-3 digits of startyear
endyear = startyear // 10 * 10 + endyear
elif endyear // 100 == 0:
# endyear is only 2-digit: add the first 1-2 digits of startyear
endyear = startyear // 100 * 100 + endyear

elif year_only:
Expand Down
19 changes: 13 additions & 6 deletions test/unittests/commands/excel2xml/test_excel2xml_lib.py
Expand Up @@ -188,7 +188,7 @@ def test_make_xsd_id_compatible(self) -> None:
Path("special-characters.xml").unlink()

def test_find_date_in_string(self) -> None:
# template: 2021-01-01 | 2015_01_02
# template: 2021-01-01
testcases = {
"text 1492-10-12, text": "GREGORIAN:CE:1492-10-12:CE:1492-10-12",
"Text 0476-09-04. text": "GREGORIAN:CE:0476-09-04:CE:0476-09-04",
Expand All @@ -206,7 +206,7 @@ def test_find_date_in_string(self) -> None:
for testcase, expected in testcases.items():
self.assertEqual(excel2xml.find_date_in_string(testcase), expected, msg=f"Failed with '{testcase}'")

# template: 26.2.-24.3.1948
# template: 2015_01_02
testcases = {
"Text ...2193_01_26... text": "GREGORIAN:CE:2193-01-26:CE:2193-01-26",
"Text -2193_01_26- text": "GREGORIAN:CE:2193-01-26:CE:2193-01-26",
Expand All @@ -215,7 +215,7 @@ def test_find_date_in_string(self) -> None:
for testcase, expected in testcases.items():
self.assertEqual(excel2xml.find_date_in_string(testcase), expected, msg=f"Failed with '{testcase}'")

# template: 27.-28.1.1900
# template: 26.2.-24.3.1948
testcases = {
"Text _1.3. - 25.4.2022_ text": "GREGORIAN:CE:2022-03-01:CE:2022-04-25",
"Text (01.03. - 25.04.2022) text": "GREGORIAN:CE:2022-03-01:CE:2022-04-25",
Expand All @@ -225,7 +225,7 @@ def test_find_date_in_string(self) -> None:
for testcase, expected in testcases.items():
self.assertEqual(excel2xml.find_date_in_string(testcase), expected, msg=f"Failed with '{testcase}'")

# template: 1.12.1973 - 6.1.1974
# template: 27.-28.1.1900
testcases = {
"Text 25.-26.2.0800 text": "GREGORIAN:CE:0800-02-25:CE:0800-02-26",
"Text 25. - 26.2.0800 text": "GREGORIAN:CE:0800-02-25:CE:0800-02-26",
Expand All @@ -234,7 +234,7 @@ def test_find_date_in_string(self) -> None:
for testcase, expected in testcases.items():
self.assertEqual(excel2xml.find_date_in_string(testcase), expected, msg=f"Failed with '{testcase}'")

# template: 31.4.2021 | 5/11/2021
# template: 1.12.1973 - 6.1.1974
testcases = {
"Text 1.9.2022-3.1.2024 text": "GREGORIAN:CE:2022-09-01:CE:2024-01-03",
"Text 25.12.2022 - 3.1.2024 text": "GREGORIAN:CE:2022-12-25:CE:2024-01-03",
Expand All @@ -257,13 +257,20 @@ def test_find_date_in_string(self) -> None:
for testcase, expected in testcases.items():
self.assertEqual(excel2xml.find_date_in_string(testcase), expected, msg=f"Failed with '{testcase}'")

# template: 1907
# template: 1907 | 476
self.assertEqual(excel2xml.find_date_in_string("Text 1848 text"), "GREGORIAN:CE:1848:CE:1848")
self.assertEqual(excel2xml.find_date_in_string("Text 0476 text"), "GREGORIAN:CE:476:CE:476")
self.assertEqual(excel2xml.find_date_in_string("Text 476 text"), "GREGORIAN:CE:476:CE:476")

# template: 1849/50 | 1845-50 | 1849/1850 | 840-50 | 844-8
testcases = {
"Text 1849/1850? text": "GREGORIAN:CE:1849:CE:1850",
"Text 1845-1850, text": "GREGORIAN:CE:1845:CE:1850",
"Text 800-900, text": "GREGORIAN:CE:800:CE:900",
"Text 840-50, text": "GREGORIAN:CE:840:CE:850",
"Text 844-8, text": "GREGORIAN:CE:844:CE:848",
"Text 1840-1, text": "GREGORIAN:CE:1840:CE:1841",
"Text 0750-0760 text": "GREGORIAN:CE:750:CE:760",
"Text 1849/50. text": "GREGORIAN:CE:1849:CE:1850",
"Text (1845-50) text": "GREGORIAN:CE:1845:CE:1850",
"Text [1849/1850] text": "GREGORIAN:CE:1849:CE:1850",
Expand Down

0 comments on commit 1afee03

Please sign in to comment.