dasch-swiss · jnussbaum · Aug 11, 2023 · Aug 11, 2023 · Aug 11, 2023 · Aug 11, 2023
diff --git a/src/dsp_tools/excel2xml.py b/src/dsp_tools/excel2xml.py
@@ -7,7 +7,6 @@
 import difflib
 import json
 import os
-import re
 import uuid
 import warnings
 from typing import Any, Callable, Iterable, Optional, Union
@@ -57,10 +56,10 @@ def make_xsd_id_compatible(string: str) -> str:
         raise BaseError(f"The input '{string}' cannot be transformed to an xsd:ID")
 
     # if start of string is neither letter nor underscore, add an underscore
-    res = re.sub(r"^(?=[^A-Za-z_])", "_", string)
+    res = regex.sub(r"^(?=[^A-Za-z_])", "_", string)
 
     # replace all illegal characters by underscore
-    res = re.sub(r"[^\w_\-.]", "_", res, flags=re.ASCII)
+    res = regex.sub(r"[^\w_\-.]", "_", res, flags=regex.ASCII)
 
     # add uuid
     _uuid = uuid.uuid4()
@@ -152,25 +151,28 @@ def find_date_in_string(string: str) -> Optional[str]:
     lookahead = r"(?![0-9A-Za-z])"
 
     # template: 2021-01-01 | 2015_01_02
-    iso_date = re.search(rf"{lookbehind}{year_regex}[_-]([0-1][0-9])[_-]([0-3][0-9]){lookahead}", string)
+    iso_date = regex.search(rf"{lookbehind}{year_regex}[_-]([0-1][0-9])[_-]([0-3][0-9]){lookahead}", string)
     # template: 6.-8.3.1948 | 6/2/1947 - 24.03.1948
     eur_date_range_regex = (
         rf"{lookbehind}"
         rf"{day_regex}{sep_regex}(?:{month_regex}{sep_regex}{year_regex}?)? ?(?:-|:|to) ?"
         rf"{day_regex}{sep_regex}{month_regex}{sep_regex}{year_regex}"
         rf"{lookahead}"
     )
-    eur_date_range = re.search(eur_date_range_regex, string)
+    eur_date_range = regex.search(eur_date_range_regex, string)
     # template: 1.4.2021 | 5/11/2021
-    eur_date = re.search(rf"{lookbehind}{day_regex}{sep_regex}{month_regex}{sep_regex}{year_regex}{lookahead}", string)
+    eur_date = regex.search(
+        rf"{lookbehind}{day_regex}{sep_regex}{month_regex}{sep_regex}{year_regex}{lookahead}",
+        string,
+    )
     # template: March 9, 1908 | March5,1908 | May 11, 1906
     all_months = "|".join(months_dict)
     monthname_date_regex = rf"{lookbehind}({all_months}) ?{day_regex}, ?{year_regex}{lookahead}"
-    monthname_date = re.search(monthname_date_regex, string)
+    monthname_date = regex.search(monthname_date_regex, string)
     # template: 1849/50 | 1849-50 | 1849/1850
-    year_range = re.search(lookbehind + year_regex + r"[/-](\d{2}|\d{4})" + lookahead, string)
+    year_range = regex.search(lookbehind + year_regex + r"[/-](\d{2}|\d{4})" + lookahead, string)
     # template: 1907
-    year_only = re.search(rf"{lookbehind}{year_regex}{lookahead}", string)
+    year_only = regex.search(rf"{lookbehind}{year_regex}{lookahead}", string)
 
     if iso_date:
         year = int(iso_date.group(1))
@@ -589,7 +591,7 @@ def make_color_prop(
 
     # check value type
     for val in values:
-        if not re.search(r"^#[0-9a-f]{6}$", str(val.value).strip(), flags=re.IGNORECASE):
+        if not regex.search(r"^#[0-9a-f]{6}$", str(val.value).strip(), flags=regex.IGNORECASE):
             raise BaseError(
                 f"Failed validation in resource '{calling_resource}', property '{name}': "
                 f"'{val.value}' is not a valid color."
@@ -670,7 +672,7 @@ def make_date_prop(
         + r"((:CE|:BCE)?(:\d{4})(-\d{1,2})?(-\d{1,2})?)?$"
     )
     for val in values:
-        if not re.search(validation_regex, str(val.value).strip()):
+        if not regex.search(validation_regex, str(val.value).strip()):
             raise BaseError(
                 f"Failed validation in resource '{calling_resource}', property '{name}': "
                 f"'{val.value}' is not a valid DSP date."
@@ -887,7 +889,7 @@ def make_geoname_prop(
 
     # check value type
     for val in values:
-        if not re.search(r"^[0-9]+$", str(val.value)):
+        if not regex.search(r"^[0-9]+$", str(val.value)):
             raise BaseError(
                 f"Failed validation in resource '{calling_resource}', property '{name}': "
                 f"'{val.value}' is not a geonames.org identifier."
@@ -1030,7 +1032,10 @@ def make_interval_prop(
 
     # check value type
     for val in values:
-        if not re.match(r"([+-]?([0-9]+([.][0-9]*)?|[.][0-9]+)):([+-]?([0-9]+([.][0-9]*)?|[.][0-9]+))", str(val.value)):
+        if not regex.match(
+            r"([+-]?([0-9]+([.][0-9]*)?|[.][0-9]+)):([+-]?([0-9]+([.][0-9]*)?|[.][0-9]+))",
+            str(val.value),
+        ):
             raise BaseError(
                 f"Failed validation in resource '{calling_resource}', property '{name}': "
                 f"'{val.value}' is not a valid DSP interval."
@@ -1284,7 +1289,7 @@ def make_text_prop(
             # enforce that the text is well-formed XML: serialize tag ...
             content = etree.tostring(value_, encoding="unicode")
             # ... insert text at the very end of the string, and add ending tag to the previously single <text/> tag ...
-            content = re.sub(r"/>$", f">{val.value}</text>", content)
+            content = regex.sub(r"/>$", f">{val.value}</text>", content)
             # ... try to parse it again
             try:
                 value_ = etree.fromstring(content)
@@ -1352,7 +1357,7 @@ def make_time_prop(
     # check value type
     validation_regex = r"^\d{4}-[0-1]\d-[0-3]\dT[0-2]\d:[0-5]\d:[0-5]\d(.\d{1,12})?(Z|[+-][0-1]\d:[0-5]\d)$"
     for val in values:
-        if not re.search(validation_regex, str(val.value)):
+        if not regex.search(validation_regex, str(val.value)):
             raise BaseError(
                 f"Failed validation in resource '{calling_resource}', property '{name}': "
                 f"'{val.value}' is not a valid DSP time."
@@ -1845,15 +1850,15 @@ def _read_cli_input_file(datafile: str) -> pd.DataFrame:
     Returns:
         a pandas DataFrame with the input data
     """
-    if re.search(r"\.csv$", datafile):
+    if regex.search(r"\.csv$", datafile):
         dataframe = pd.read_csv(
             filepath_or_buffer=datafile,
             encoding="utf_8_sig",  # utf_8_sig is the default encoding of Excel
             dtype="str",
             sep=None,
             engine="python",  # let the "python" engine detect the separator
         )
-    elif re.search(r"(\.xls|\.xlsx)$", datafile):
+    elif regex.search(r"(\.xls|\.xlsx)$", datafile):
         dataframe = pd.read_excel(io=datafile, dtype="str")
     else:
         raise BaseError(f"Cannot open file '{datafile}': Invalid extension. Allowed extensions: 'csv', 'xls', 'xlsx'")

diff --git a/src/dsp_tools/models/connection.py b/src/dsp_tools/models/connection.py
@@ -1,7 +1,7 @@
 import json
-import re
 from typing import Any, Optional, Union
 
+import regex
 import requests
 
 from dsp_tools.models.exceptions import BaseError
@@ -49,7 +49,7 @@ def __init__(self, server: str, prefixes: dict[str, str] = None):
         :param prefixes: Ontology prefixes used
         """
 
-        self._server = re.sub(r"\/$", "", server)
+        self._server = regex.sub(r"\/$", "", server)
         self._prefixes = prefixes
         self._token = None
         self._log = False

diff --git a/src/dsp_tools/models/exceptions.py b/src/dsp_tools/models/exceptions.py
@@ -1,7 +1,8 @@
 import json
-import re
 from typing import Optional
 
+import regex
+
 
 class BaseError(Exception):
     """
@@ -51,7 +52,7 @@ def __init__(
                 parsed_json = json.loads(json_content_of_api_response)
                 if "knora-api:error" in parsed_json:
                     knora_api_error = parsed_json["knora-api:error"]
-                    knora_api_error = re.sub(r"^dsp\.errors\.[A-Za-z]+?: ?", "", knora_api_error)
+                    knora_api_error = regex.sub(r"^dsp\.errors\.[A-Za-z]+?: ?", "", knora_api_error)
                     self.orig_err_msg_from_api = knora_api_error
             except json.JSONDecodeError:
                 pass

diff --git a/src/dsp_tools/models/helpers.py b/src/dsp_tools/models/helpers.py
@@ -1,10 +1,11 @@
 # pylint: disable=missing-class-docstring,missing-function-docstring
 
-import re
 from dataclasses import dataclass
 from enum import Enum, unique
 from typing import Any, Optional, Pattern, Union
 
+import regex
+
 from dsp_tools.models.exceptions import BaseError
 
 #
@@ -30,7 +31,7 @@ class OntoIri:
 
 
 class IriTest:  # pylint: disable=too-few-public-methods
-    __iri_regexp = re.compile("^(http)s?://([\\w\\.\\-~]+)?(:\\d{,6})?(/[\\w\\-~]+)*(#[\\w\\-~]*)?")
+    __iri_regexp = regex.compile("^(http)s?://([\\w\\.\\-~]+)?(:\\d{,6})?(/[\\w\\-~]+)*(#[\\w\\-~]*)?")
 
     @classmethod
     def test(cls, val: str) -> bool:
@@ -124,7 +125,7 @@ def __init__(self, context: Optional[dict[str, str]] = None):
         :param context: A dict of prefix - ontology-iri pairs
         """
         # regexp to test for a complete IRI (including fragment identifier)
-        self._exp = re.compile("^(http)s?://([\\w\\.\\-~]+)?(:\\d{,6})?(/[\\w\\-~]+)*(#[\\w\\-~]*)?")
+        self._exp = regex.compile("^(http)s?://([\\w\\.\\-~]+)?(:\\d{,6})?(/[\\w\\-~]+)*(#[\\w\\-~]*)?")
         self._context = ContextType({})
 
         # add ontologies from context, if any
@@ -306,7 +307,7 @@ def get_prefixed_iri(self, iri: Optional[str]) -> Optional[str]:
             return None
 
         # check if the iri already has the form "prefix:name"
-        m = re.match("([\\w-]+):([\\w-]+)", iri)
+        m = regex.match("([\\w-]+):([\\w-]+)", iri)
         if m and m.span()[1] == len(iri):
             return iri
 
@@ -399,13 +400,13 @@ def __init__(self, val: Any):
         :param val: xsd:dateTimeStamp as string, instance of "DateTimeStamp" or json-ld construct
         """
         if isinstance(val, str):
-            if not re.search(self._validation_regex, val):
+            if not regex.search(self._validation_regex, val):
                 raise BaseError(f"Invalid xsd:dateTimeStamp: '{val}'")
             self._dateTimeStamp = val
         elif isinstance(val, DateTimeStamp):
             self._dateTimeStamp = str(val)
         else:
-            if val.get("@type") == "xsd:dateTimeStamp" and re.search(self._validation_regex, str(val.get("@value"))):
+            if val.get("@type") == "xsd:dateTimeStamp" and regex.search(self._validation_regex, str(val.get("@value"))):
                 self._dateTimeStamp = val["@value"]
             else:
                 raise BaseError(f"Invalid xsd:dateTimeStamp: '{val}'")

diff --git a/src/dsp_tools/models/ontology.py b/src/dsp_tools/models/ontology.py
@@ -29,10 +29,11 @@
 
 import copy
 import json
-import re
 from typing import Any, Optional, Union
 from urllib.parse import quote_plus
 
+import regex
+
 from dsp_tools.models.connection import Connection
 from dsp_tools.models.exceptions import BaseError
 from dsp_tools.models.helpers import Actions, Context, DateTimeStamp, WithId
@@ -352,7 +353,7 @@ def getProjectOntologies(con: Connection, project_id: str) -> list["Ontology"]:
 
     @staticmethod
     def getOntologyFromServer(con: Connection, shortcode: str, name: str) -> "Ontology":
-        if re.search(r"[0-9A-F]{4}", shortcode):
+        if regex.search(r"[0-9A-F]{4}", shortcode):
             result = con.get("/ontology/" + shortcode + "/" + name + "/v2" + Ontology.ALL_LANGUAGES)
         else:
             result = con.get("/ontology/" + name + "/v2" + Ontology.ALL_LANGUAGES)

diff --git a/src/dsp_tools/models/permission.py b/src/dsp_tools/models/permission.py
@@ -1,9 +1,10 @@
 # pylint: disable=missing-class-docstring,missing-function-docstring
 
-import re
 from enum import Enum, unique
 from typing import Optional, Union
 
+import regex
+
 
 @unique
 class PermissionValue(Enum):
@@ -75,7 +76,7 @@ def fromString(cls, permstr: str):
         tmpstr = permstr.split("|")
         permissions: dict[PermissionValue, list[str]] = {}
         for s in tmpstr:
-            key, *vals = re.split("[\\s,]+", s)
+            key, *vals = regex.split("[\\s,]+", s)
             permissions[PermissionValue[key]] = vals
         return cls(permissions)
 

diff --git a/src/dsp_tools/models/propertyclass.py b/src/dsp_tools/models/propertyclass.py
@@ -1,10 +1,11 @@
 # pylint: disable=missing-class-docstring,missing-function-docstring,duplicate-code
 
 import json
-import re
 from typing import Any, Optional, Sequence, Union
 from urllib.parse import quote_plus
 
+import regex
+
 from dsp_tools.models.connection import Connection
 from dsp_tools.models.exceptions import BaseError
 from dsp_tools.models.helpers import Actions, Context, DateTimeStamp, WithId
@@ -302,7 +303,7 @@ def resolve_propref(resref: str):
                 return {"@id": "knora-api:" + resref}  # no ":", must be from knora-api!
 
         tmp = {}
-        exp = re.compile("^http.*")  # It is already a fully IRI
+        exp = regex.compile("^http.*")  # It is already a fully IRI
         if exp.match(self._ontology_id):
             propid = self._context.prefix_from_iri(self._ontology_id) + ":" + self._name
             ontid = self._ontology_id

diff --git a/src/dsp_tools/models/resource.py b/src/dsp_tools/models/resource.py
@@ -1,10 +1,11 @@
 import json
-import re
 from copy import deepcopy
 from dataclasses import dataclass
 from typing import Any, Optional, Type, Union
 from urllib.parse import quote_plus
 
+import regex
+
 from dsp_tools.models.bitstream import Bitstream
 from dsp_tools.models.connection import Connection
 from dsp_tools.models.exceptions import BaseError
@@ -354,11 +355,11 @@ class ResourceInstanceFactory:
 
     def __init__(self, con: Connection, projident: str) -> None:
         self._con = con
-        if re.match("^[0-9a-fA-F]{4}$", projident):
+        if regex.match("^[0-9a-fA-F]{4}$", projident):
             project = Project(con=con, shortcode=projident)
-        elif re.match("^[\\w-]+$", projident):
+        elif regex.match("^[\\w-]+$", projident):
             project = Project(con=con, shortname=projident)
-        elif re.match("^(http)s?://([\\w\\.\\-~]+:?\\d{,4})(/[\\w\\-~]+)+$", projident):
+        elif regex.match("^(http)s?://([\\w\\.\\-~]+:?\\d{,4})(/[\\w\\-~]+)+$", projident):
             project = Project(con=con, shortname=projident)
         else:
             raise BaseError("Invalid project identification!")

diff --git a/src/dsp_tools/models/resourceclass.py b/src/dsp_tools/models/resourceclass.py
@@ -8,11 +8,12 @@
 # pylint: disable=missing-class-docstring,missing-function-docstring,too-many-instance-attributes,duplicate-code
 
 import json
-import re
 from enum import Enum
 from typing import Any, Optional, Sequence, Union
 from urllib.parse import quote_plus
 
+import regex
+
 from dsp_tools.models.connection import Connection
 from dsp_tools.models.exceptions import BaseError
 from dsp_tools.models.helpers import Actions, Cardinality, Context, DateTimeStamp
@@ -635,7 +636,7 @@ def resolve_resref(resref: str):
                 return {"@id": "knora-api:" + resref}  # no ":", must be from knora-api!
 
         tmp = {}
-        exp = re.compile("^http.*")  # It is already a fully IRI
+        exp = regex.compile("^http.*")  # It is already a fully IRI
         if exp.match(self._ontology_id):
             resid = self._context.prefix_from_iri(self._ontology_id) + ":" + self._name
             ontid = self._ontology_id