Skip to content

Commit

Permalink
fix(excel2xml): find_date_in_string() must not return invalid date (D…
Browse files Browse the repository at this point in the history
  • Loading branch information
jnussbaum committed Dec 15, 2023
1 parent b0f598c commit 71b754d
Show file tree
Hide file tree
Showing 2 changed files with 80 additions and 87 deletions.
4 changes: 3 additions & 1 deletion src/dsp_tools/commands/excel2xml/excel2xml_lib.py
Expand Up @@ -242,7 +242,7 @@ def _from_eur_date_range(eur_date_range: Match[str]) -> str:
endyear = int(eur_date_range.group(6))
startdate = datetime.date(startyear, startmonth, startday)
enddate = datetime.date(endyear, endmonth, endday)
if enddate < startdate:
if enddate <= startdate:
raise ValueError
return f"GREGORIAN:CE:{startdate.isoformat()}:CE:{enddate.isoformat()}"

Expand Down Expand Up @@ -272,6 +272,8 @@ def _from_year_range(year_range: Match[str]) -> str:
elif endyear // 100 == 0:
# endyear is only 2-digit: add the first 1-2 digits of startyear
endyear = startyear // 100 * 100 + endyear
if endyear <= startyear:
raise ValueError
return f"GREGORIAN:CE:{startyear}:CE:{endyear}"


Expand Down
163 changes: 77 additions & 86 deletions test/unittests/commands/excel2xml/test_excel2xml_lib.py
Expand Up @@ -188,96 +188,87 @@ def test_make_xsd_id_compatible(self) -> None:
excel2xml.write_xml(root, "special-characters.xml")
Path("special-characters.xml").unlink()

def test_find_date_in_string(self) -> None:
# template: 2021-01-01
testcases = {
"text 1492-10-12, text": "GREGORIAN:CE:1492-10-12:CE:1492-10-12",
"Text 0476-09-04. text": "GREGORIAN:CE:0476-09-04:CE:0476-09-04",
"Text (0476-09-04) text": "GREGORIAN:CE:0476-09-04:CE:0476-09-04",
"Text [1492-10-32?] text": None,
}
for testcase, expected in testcases.items():
self.assertEqual(excel2xml.find_date_in_string(testcase), expected, msg=f"Failed with '{testcase}'")

# template: 31.4.2021 | 5/11/2021
testcases = {
"Text (30.4.2021) text": "GREGORIAN:CE:2021-04-30:CE:2021-04-30",
"Text (5/11/2021) text": "GREGORIAN:CE:2021-11-05:CE:2021-11-05",
}
for testcase, expected in testcases.items():
self.assertEqual(excel2xml.find_date_in_string(testcase), expected, msg=f"Failed with '{testcase}'")

# template: 2015_01_02
testcases = {
"Text ...2193_01_26... text": "GREGORIAN:CE:2193-01-26:CE:2193-01-26",
"Text -2193_01_26- text": "GREGORIAN:CE:2193-01-26:CE:2193-01-26",
"Text 2193_02_30 text": None,
}
for testcase, expected in testcases.items():
self.assertEqual(excel2xml.find_date_in_string(testcase), expected, msg=f"Failed with '{testcase}'")

# template: 26.2.-24.3.1948
testcases = {
"Text _1.3. - 25.4.2022_ text": "GREGORIAN:CE:2022-03-01:CE:2022-04-25",
"Text (01.03. - 25.04.2022) text": "GREGORIAN:CE:2022-03-01:CE:2022-04-25",
"Text 28.2.-1.12.1515 text": "GREGORIAN:CE:1515-02-28:CE:1515-12-01",
"Text 28.2.-26.2.1515 text": None,
}
for testcase, expected in testcases.items():
self.assertEqual(excel2xml.find_date_in_string(testcase), expected, msg=f"Failed with '{testcase}'")

# template: 27.-28.1.1900
testcases = {
"Text 25.-26.2.0800 text": "GREGORIAN:CE:0800-02-25:CE:0800-02-26",
"Text 25. - 26.2.0800 text": "GREGORIAN:CE:0800-02-25:CE:0800-02-26",
"Text 25.-24.2.0800 text": None,
}
for testcase, expected in testcases.items():
self.assertEqual(excel2xml.find_date_in_string(testcase), expected, msg=f"Failed with '{testcase}'")

# template: 1.12.1973 - 6.1.1974
testcases = {
"Text 1.9.2022-3.1.2024 text": "GREGORIAN:CE:2022-09-01:CE:2024-01-03",
"Text 25.12.2022 - 3.1.2024 text": "GREGORIAN:CE:2022-12-25:CE:2024-01-03",
"Text 25.12.2022-03.01.2022 text": None,
"Text 25/12/2022-03/01/2024 text": "GREGORIAN:CE:2022-12-25:CE:2024-01-03",
"Text 25/12/2022 - 3/1/2024 text": "GREGORIAN:CE:2022-12-25:CE:2024-01-03",
"Text 25/12/2022-03/01/2022 text": None,
}
for testcase, expected in testcases.items():
self.assertEqual(excel2xml.find_date_in_string(testcase), expected, msg=f"Failed with '{testcase}'")

# template: February 9, 1908 | Dec 5,1908
testcases = {
"Text Jan 26, 1993 text": "GREGORIAN:CE:1993-01-26:CE:1993-01-26",
"Text February26,2051 text": "GREGORIAN:CE:2051-02-26:CE:2051-02-26",
"Text Sept 1, 1000 text": "GREGORIAN:CE:1000-09-01:CE:1000-09-01",
"Text October 01, 1000 text": "GREGORIAN:CE:1000-10-01:CE:1000-10-01",
"Text Nov 6,1000 text": "GREGORIAN:CE:1000-11-06:CE:1000-11-06",
}
for testcase, expected in testcases.items():
self.assertEqual(excel2xml.find_date_in_string(testcase), expected, msg=f"Failed with '{testcase}'")

# template: 1907 | 476
def test_find_date_in_string_iso(self) -> None:
    """Check ISO-style dates (template: 2021-01-01).

    A valid date surrounded by text or punctuation is expected to be returned
    as a GREGORIAN range whose start and end are that same day.
    A string containing an impossible date is expected to yield None.
    """
    self.assertEqual(excel2xml.find_date_in_string("x 1492-10-12, x"), "GREGORIAN:CE:1492-10-12:CE:1492-10-12")
    self.assertEqual(excel2xml.find_date_in_string("x 0476-09-04. x"), "GREGORIAN:CE:0476-09-04:CE:0476-09-04")
    self.assertEqual(excel2xml.find_date_in_string("x (0476-09-04) x"), "GREGORIAN:CE:0476-09-04:CE:0476-09-04")
    # October has no 32nd day, so nothing must be reported
    self.assertIsNone(excel2xml.find_date_in_string("x [1492-10-32?] x"))

def test_find_date_in_string_eur_date(self) -> None:
    """Check single day-first and underscore-separated dates.

    Templates: 31.4.2021 | 5/11/2021 | 2015_01_02

    Each valid date is expected to be returned as a GREGORIAN range whose
    start and end are that same day; an impossible date must yield None.
    """
    self.assertEqual(excel2xml.find_date_in_string("x (30.4.2021) x"), "GREGORIAN:CE:2021-04-30:CE:2021-04-30")
    self.assertEqual(excel2xml.find_date_in_string("x (5/11/2021) x"), "GREGORIAN:CE:2021-11-05:CE:2021-11-05")
    self.assertEqual(excel2xml.find_date_in_string("x ...2193_01_26... x"), "GREGORIAN:CE:2193-01-26:CE:2193-01-26")
    self.assertEqual(excel2xml.find_date_in_string("x -2193_01_26- x"), "GREGORIAN:CE:2193-01-26:CE:2193-01-26")
    # February has no 30th day, so nothing must be reported
    self.assertIsNone(excel2xml.find_date_in_string("x 2193_02_30 x"))

def test_find_date_in_string_eur_date_range(self) -> None:
    """Check day ranges within one month (template: 27.-28.1.1900).

    A range whose end day lies after its start day is expected to be returned
    as a GREGORIAN range. A range whose end is before or equal to its start
    is expected to yield None.
    """
    self.assertEqual(excel2xml.find_date_in_string("x 25.-26.2.0800 x"), "GREGORIAN:CE:0800-02-25:CE:0800-02-26")
    self.assertEqual(excel2xml.find_date_in_string("x 25. - 26.2.0800 x"), "GREGORIAN:CE:0800-02-25:CE:0800-02-26")
    # end before start
    self.assertIsNone(excel2xml.find_date_in_string("x 25.-24.2.0800 x"))
    # end equal to start
    self.assertIsNone(excel2xml.find_date_in_string("x 25.-25.2.0800 x"))

def test_find_date_in_string_eur_date_range_across_month(self) -> None:
    """Check date ranges spanning several months of one year (template: 26.2.-24.3.1948).

    Day and month may be written with or without leading zeros.
    A range whose end is before or equal to its start is expected to yield None.
    """
    expected = "GREGORIAN:CE:2022-03-01:CE:2022-04-25"
    self.assertEqual(excel2xml.find_date_in_string("x _1.3. - 25.4.2022_ x"), expected)
    self.assertEqual(excel2xml.find_date_in_string("x (01.03. - 25.04.2022) x"), expected)

    self.assertEqual(excel2xml.find_date_in_string("x 28.2.-1.12.1515 x"), "GREGORIAN:CE:1515-02-28:CE:1515-12-01")
    # end before start
    self.assertIsNone(excel2xml.find_date_in_string("x 28.2.-26.2.1515 x"))
    # end equal to start
    self.assertIsNone(excel2xml.find_date_in_string("x 28.2.-28.2.1515 x"))

def test_find_date_in_string_eur_date_range_across_year(self) -> None:
    """Check date ranges with two full dates (template: 1.12.1973 - 6.1.1974).

    Both "." and "/" are exercised as separators, with and without spaces
    around the dash. A range whose end is before or equal to its start is
    expected to yield None.
    """
    expected = "GREGORIAN:CE:2022-09-01:CE:2024-01-03"
    self.assertEqual(excel2xml.find_date_in_string("x 1.9.2022-3.1.2024 x"), expected)

    expected = "GREGORIAN:CE:2022-12-25:CE:2024-01-03"
    self.assertEqual(excel2xml.find_date_in_string("x 25.12.2022 - 3.1.2024 x"), expected)
    self.assertEqual(excel2xml.find_date_in_string("x 25/12/2022-03/01/2024 x"), expected)
    self.assertEqual(excel2xml.find_date_in_string("x 25/12/2022 - 3/1/2024 x"), expected)

    # dot-separated: end before start, then end equal to start
    self.assertIsNone(excel2xml.find_date_in_string("x 25.12.2022-03.01.2022 x"))
    self.assertIsNone(excel2xml.find_date_in_string("x 25.12.2022-25.12.2022 x"))
    # slash-separated: end before start, then end equal to start
    self.assertIsNone(excel2xml.find_date_in_string("x 25/12/2022-03/01/2022 x"))
    self.assertIsNone(excel2xml.find_date_in_string("x 25/12/2022-25/12/2022 x"))

def test_find_date_in_string_monthname(self) -> None:
    """Check dates written with an English month name (template: February 9, 1908 | Dec 5,1908)."""
    # (input text, expected result) pairs, checked in this order
    samples = (
        ("x Jan 26, 1993 x", "GREGORIAN:CE:1993-01-26:CE:1993-01-26"),
        ("x February26,2051 x", "GREGORIAN:CE:2051-02-26:CE:2051-02-26"),
        ("x Sept 1, 1000 x", "GREGORIAN:CE:1000-09-01:CE:1000-09-01"),
        ("x October 01, 1000 x", "GREGORIAN:CE:1000-10-01:CE:1000-10-01"),
        ("x Nov 6,1000 x", "GREGORIAN:CE:1000-11-06:CE:1000-11-06"),
    )
    for text, wanted in samples:
        self.assertEqual(excel2xml.find_date_in_string(text), wanted)

def test_find_date_in_string_single_year(self) -> None:
    """Check lone years with 3 or 4 digits (template: 1907 | 476)."""
    # (input text, expected result) pairs; a leading zero is not kept in the result
    samples = (
        ("Text 1848 text", "GREGORIAN:CE:1848:CE:1848"),
        ("Text 0476 text", "GREGORIAN:CE:476:CE:476"),
        ("Text 476 text", "GREGORIAN:CE:476:CE:476"),
    )
    for text, wanted in samples:
        self.assertEqual(excel2xml.find_date_in_string(text), wanted)

# template: 1849/50 | 1845-50 | 1849/1850
testcases = {
"Text 1849/1850? text": "GREGORIAN:CE:1849:CE:1850",
"Text 1845-1850, text": "GREGORIAN:CE:1845:CE:1850",
"Text 800-900, text": "GREGORIAN:CE:800:CE:900",
"Text 840-50, text": "GREGORIAN:CE:840:CE:850",
"Text 844-8, text": "GREGORIAN:CE:844:CE:848",
"Text 1840-1, text": "GREGORIAN:CE:1840:CE:1841",
"Text 0750-0760 text": "GREGORIAN:CE:750:CE:760",
"Text 1849/50. text": "GREGORIAN:CE:1849:CE:1850",
"Text (1845-50) text": "GREGORIAN:CE:1845:CE:1850",
"Text [1849/1850] text": "GREGORIAN:CE:1849:CE:1850",
}
for testcase, expected in testcases.items():
self.assertEqual(excel2xml.find_date_in_string(testcase), expected, msg=f"Failed with '{testcase}'")
def test_find_date_in_string_year_range(self) -> None:
    """Check year ranges (template: 1849/50 | 1845-50 | 1849/1850).

    The end year may be abbreviated to 1 or 2 digits. Per the expected values
    below, it is then read as a year shortly after the start year
    (e.g. 840-50 -> 840 to 850, 1840-1 -> 1840 to 1841).
    A range whose (completed) end year is before or equal to its start year
    is expected to yield None.
    """
    # valid ranges: full end year
    self.assertEqual(excel2xml.find_date_in_string("x 1849/1850? x"), "GREGORIAN:CE:1849:CE:1850")
    self.assertEqual(excel2xml.find_date_in_string("x 1845-1850, x"), "GREGORIAN:CE:1845:CE:1850")
    self.assertEqual(excel2xml.find_date_in_string("x 800-900, x"), "GREGORIAN:CE:800:CE:900")
    # valid ranges: abbreviated end year
    self.assertEqual(excel2xml.find_date_in_string("x 840-50, x"), "GREGORIAN:CE:840:CE:850")
    self.assertEqual(excel2xml.find_date_in_string("x 844-8, x"), "GREGORIAN:CE:844:CE:848")
    self.assertEqual(excel2xml.find_date_in_string("x 1840-1, x"), "GREGORIAN:CE:1840:CE:1841")
    # valid ranges: leading zeros and surrounding punctuation
    self.assertEqual(excel2xml.find_date_in_string("x 0750-0760 x"), "GREGORIAN:CE:750:CE:760")
    self.assertEqual(excel2xml.find_date_in_string("x 1849/50. x"), "GREGORIAN:CE:1849:CE:1850")
    self.assertEqual(excel2xml.find_date_in_string("x (1845-50) x"), "GREGORIAN:CE:1845:CE:1850")
    self.assertEqual(excel2xml.find_date_in_string("x [1849/1850] x"), "GREGORIAN:CE:1849:CE:1850")

    # invalid: full end year before / equal to start year
    self.assertIsNone(excel2xml.find_date_in_string("x 1850-1849 x"))
    self.assertIsNone(excel2xml.find_date_in_string("x 1850-1850 x"))
    # invalid: abbreviated end year before / equal to start year
    self.assertIsNone(excel2xml.find_date_in_string("x 830-20 x"))
    self.assertIsNone(excel2xml.find_date_in_string("x 830-30 x"))
    self.assertIsNone(excel2xml.find_date_in_string("x 1811-10 x"))
    self.assertIsNone(excel2xml.find_date_in_string("x 1811-11 x"))
    self.assertIsNone(excel2xml.find_date_in_string("x 1811/10 x"))
    self.assertIsNone(excel2xml.find_date_in_string("x 1811/11 x"))

def test_find_date_in_string_french_bc(self) -> None:
self.assertEqual(excel2xml.find_date_in_string("Text 12345 av. J.-C. text"), "GREGORIAN:BC:12345:BC:12345")
Expand Down

0 comments on commit 71b754d

Please sign in to comment.