Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: replace re by regex (DEV-2531) #465

Merged
merged 4 commits on Aug 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
39 changes: 22 additions & 17 deletions src/dsp_tools/excel2xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import difflib
import json
import os
import re
import uuid
import warnings
from typing import Any, Callable, Iterable, Optional, Union
Expand Down Expand Up @@ -57,10 +56,10 @@ def make_xsd_id_compatible(string: str) -> str:
raise BaseError(f"The input '{string}' cannot be transformed to an xsd:ID")

# if start of string is neither letter nor underscore, add an underscore
res = re.sub(r"^(?=[^A-Za-z_])", "_", string)
res = regex.sub(r"^(?=[^A-Za-z_])", "_", string)

# replace all illegal characters by underscore
res = re.sub(r"[^\w_\-.]", "_", res, flags=re.ASCII)
res = regex.sub(r"[^\w_\-.]", "_", res, flags=regex.ASCII)

# add uuid
_uuid = uuid.uuid4()
Expand Down Expand Up @@ -152,25 +151,28 @@ def find_date_in_string(string: str) -> Optional[str]:
lookahead = r"(?![0-9A-Za-z])"

# template: 2021-01-01 | 2015_01_02
iso_date = re.search(rf"{lookbehind}{year_regex}[_-]([0-1][0-9])[_-]([0-3][0-9]){lookahead}", string)
iso_date = regex.search(rf"{lookbehind}{year_regex}[_-]([0-1][0-9])[_-]([0-3][0-9]){lookahead}", string)
# template: 6.-8.3.1948 | 6/2/1947 - 24.03.1948
eur_date_range_regex = (
rf"{lookbehind}"
rf"{day_regex}{sep_regex}(?:{month_regex}{sep_regex}{year_regex}?)? ?(?:-|:|to) ?"
rf"{day_regex}{sep_regex}{month_regex}{sep_regex}{year_regex}"
rf"{lookahead}"
)
eur_date_range = re.search(eur_date_range_regex, string)
eur_date_range = regex.search(eur_date_range_regex, string)
# template: 1.4.2021 | 5/11/2021
eur_date = re.search(rf"{lookbehind}{day_regex}{sep_regex}{month_regex}{sep_regex}{year_regex}{lookahead}", string)
eur_date = regex.search(
rf"{lookbehind}{day_regex}{sep_regex}{month_regex}{sep_regex}{year_regex}{lookahead}",
string,
)
# template: March 9, 1908 | March5,1908 | May 11, 1906
all_months = "|".join(months_dict)
monthname_date_regex = rf"{lookbehind}({all_months}) ?{day_regex}, ?{year_regex}{lookahead}"
monthname_date = re.search(monthname_date_regex, string)
monthname_date = regex.search(monthname_date_regex, string)
# template: 1849/50 | 1849-50 | 1849/1850
year_range = re.search(lookbehind + year_regex + r"[/-](\d{2}|\d{4})" + lookahead, string)
year_range = regex.search(lookbehind + year_regex + r"[/-](\d{2}|\d{4})" + lookahead, string)
# template: 1907
year_only = re.search(rf"{lookbehind}{year_regex}{lookahead}", string)
year_only = regex.search(rf"{lookbehind}{year_regex}{lookahead}", string)

if iso_date:
year = int(iso_date.group(1))
Expand Down Expand Up @@ -589,7 +591,7 @@ def make_color_prop(

# check value type
for val in values:
if not re.search(r"^#[0-9a-f]{6}$", str(val.value).strip(), flags=re.IGNORECASE):
if not regex.search(r"^#[0-9a-f]{6}$", str(val.value).strip(), flags=regex.IGNORECASE):
raise BaseError(
f"Failed validation in resource '{calling_resource}', property '{name}': "
f"'{val.value}' is not a valid color."
Expand Down Expand Up @@ -670,7 +672,7 @@ def make_date_prop(
+ r"((:CE|:BCE)?(:\d{4})(-\d{1,2})?(-\d{1,2})?)?$"
)
for val in values:
if not re.search(validation_regex, str(val.value).strip()):
if not regex.search(validation_regex, str(val.value).strip()):
raise BaseError(
f"Failed validation in resource '{calling_resource}', property '{name}': "
f"'{val.value}' is not a valid DSP date."
Expand Down Expand Up @@ -887,7 +889,7 @@ def make_geoname_prop(

# check value type
for val in values:
if not re.search(r"^[0-9]+$", str(val.value)):
if not regex.search(r"^[0-9]+$", str(val.value)):
raise BaseError(
f"Failed validation in resource '{calling_resource}', property '{name}': "
f"'{val.value}' is not a geonames.org identifier."
Expand Down Expand Up @@ -1030,7 +1032,10 @@ def make_interval_prop(

# check value type
for val in values:
if not re.match(r"([+-]?([0-9]+([.][0-9]*)?|[.][0-9]+)):([+-]?([0-9]+([.][0-9]*)?|[.][0-9]+))", str(val.value)):
if not regex.match(
r"([+-]?([0-9]+([.][0-9]*)?|[.][0-9]+)):([+-]?([0-9]+([.][0-9]*)?|[.][0-9]+))",
str(val.value),
):
raise BaseError(
f"Failed validation in resource '{calling_resource}', property '{name}': "
f"'{val.value}' is not a valid DSP interval."
Expand Down Expand Up @@ -1284,7 +1289,7 @@ def make_text_prop(
# enforce that the text is well-formed XML: serialize tag ...
content = etree.tostring(value_, encoding="unicode")
# ... insert text at the very end of the string, and add ending tag to the previously single <text/> tag ...
content = re.sub(r"/>$", f">{val.value}</text>", content)
content = regex.sub(r"/>$", f">{val.value}</text>", content)
# ... try to parse it again
try:
value_ = etree.fromstring(content)
Expand Down Expand Up @@ -1352,7 +1357,7 @@ def make_time_prop(
# check value type
validation_regex = r"^\d{4}-[0-1]\d-[0-3]\dT[0-2]\d:[0-5]\d:[0-5]\d(.\d{1,12})?(Z|[+-][0-1]\d:[0-5]\d)$"
for val in values:
if not re.search(validation_regex, str(val.value)):
if not regex.search(validation_regex, str(val.value)):
raise BaseError(
f"Failed validation in resource '{calling_resource}', property '{name}': "
f"'{val.value}' is not a valid DSP time."
Expand Down Expand Up @@ -1845,15 +1850,15 @@ def _read_cli_input_file(datafile: str) -> pd.DataFrame:
Returns:
a pandas DataFrame with the input data
"""
if re.search(r"\.csv$", datafile):
if regex.search(r"\.csv$", datafile):
dataframe = pd.read_csv(
filepath_or_buffer=datafile,
encoding="utf_8_sig", # utf_8_sig is the default encoding of Excel
dtype="str",
sep=None,
engine="python", # let the "python" engine detect the separator
)
elif re.search(r"(\.xls|\.xlsx)$", datafile):
elif regex.search(r"(\.xls|\.xlsx)$", datafile):
dataframe = pd.read_excel(io=datafile, dtype="str")
else:
raise BaseError(f"Cannot open file '{datafile}': Invalid extension. Allowed extensions: 'csv', 'xls', 'xlsx'")
Expand Down
4 changes: 2 additions & 2 deletions src/dsp_tools/models/connection.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import json
import re
from typing import Any, Optional, Union

import regex
import requests

from dsp_tools.models.exceptions import BaseError
Expand Down Expand Up @@ -49,7 +49,7 @@ def __init__(self, server: str, prefixes: dict[str, str] = None):
:param prefixes: Ontology prefixes used
"""

self._server = re.sub(r"\/$", "", server)
self._server = regex.sub(r"\/$", "", server)
self._prefixes = prefixes
self._token = None
self._log = False
Expand Down
5 changes: 3 additions & 2 deletions src/dsp_tools/models/exceptions.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import json
import re
from typing import Optional

import regex


class BaseError(Exception):
"""
Expand Down Expand Up @@ -51,7 +52,7 @@ def __init__(
parsed_json = json.loads(json_content_of_api_response)
if "knora-api:error" in parsed_json:
knora_api_error = parsed_json["knora-api:error"]
knora_api_error = re.sub(r"^dsp\.errors\.[A-Za-z]+?: ?", "", knora_api_error)
knora_api_error = regex.sub(r"^dsp\.errors\.[A-Za-z]+?: ?", "", knora_api_error)
self.orig_err_msg_from_api = knora_api_error
except json.JSONDecodeError:
pass
Expand Down
13 changes: 7 additions & 6 deletions src/dsp_tools/models/helpers.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
# pylint: disable=missing-class-docstring,missing-function-docstring

import re
from dataclasses import dataclass
from enum import Enum, unique
from typing import Any, Optional, Pattern, Union

import regex

from dsp_tools.models.exceptions import BaseError

#
Expand All @@ -30,7 +31,7 @@ class OntoIri:


class IriTest: # pylint: disable=too-few-public-methods
__iri_regexp = re.compile("^(http)s?://([\\w\\.\\-~]+)?(:\\d{,6})?(/[\\w\\-~]+)*(#[\\w\\-~]*)?")
__iri_regexp = regex.compile("^(http)s?://([\\w\\.\\-~]+)?(:\\d{,6})?(/[\\w\\-~]+)*(#[\\w\\-~]*)?")

@classmethod
def test(cls, val: str) -> bool:
Expand Down Expand Up @@ -124,7 +125,7 @@ def __init__(self, context: Optional[dict[str, str]] = None):
:param context: A dict of prefix - ontology-iri pairs
"""
# regexp to test for a complete IRI (including fragment identifier)
self._exp = re.compile("^(http)s?://([\\w\\.\\-~]+)?(:\\d{,6})?(/[\\w\\-~]+)*(#[\\w\\-~]*)?")
self._exp = regex.compile("^(http)s?://([\\w\\.\\-~]+)?(:\\d{,6})?(/[\\w\\-~]+)*(#[\\w\\-~]*)?")
self._context = ContextType({})

# add ontologies from context, if any
Expand Down Expand Up @@ -306,7 +307,7 @@ def get_prefixed_iri(self, iri: Optional[str]) -> Optional[str]:
return None

# check if the iri already has the form "prefix:name"
m = re.match("([\\w-]+):([\\w-]+)", iri)
m = regex.match("([\\w-]+):([\\w-]+)", iri)
if m and m.span()[1] == len(iri):
return iri

Expand Down Expand Up @@ -399,13 +400,13 @@ def __init__(self, val: Any):
:param val: xsd:dateTimeStamp as string, instance of "DateTimeStamp" or json-ld construct
"""
if isinstance(val, str):
if not re.search(self._validation_regex, val):
if not regex.search(self._validation_regex, val):
raise BaseError(f"Invalid xsd:dateTimeStamp: '{val}'")
self._dateTimeStamp = val
elif isinstance(val, DateTimeStamp):
self._dateTimeStamp = str(val)
else:
if val.get("@type") == "xsd:dateTimeStamp" and re.search(self._validation_regex, str(val.get("@value"))):
if val.get("@type") == "xsd:dateTimeStamp" and regex.search(self._validation_regex, str(val.get("@value"))):
self._dateTimeStamp = val["@value"]
else:
raise BaseError(f"Invalid xsd:dateTimeStamp: '{val}'")
Expand Down
5 changes: 3 additions & 2 deletions src/dsp_tools/models/ontology.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,11 @@

import copy
import json
import re
from typing import Any, Optional, Union
from urllib.parse import quote_plus

import regex

from dsp_tools.models.connection import Connection
from dsp_tools.models.exceptions import BaseError
from dsp_tools.models.helpers import Actions, Context, DateTimeStamp, WithId
Expand Down Expand Up @@ -352,7 +353,7 @@ def getProjectOntologies(con: Connection, project_id: str) -> list["Ontology"]:

@staticmethod
def getOntologyFromServer(con: Connection, shortcode: str, name: str) -> "Ontology":
if re.search(r"[0-9A-F]{4}", shortcode):
if regex.search(r"[0-9A-F]{4}", shortcode):
result = con.get("/ontology/" + shortcode + "/" + name + "/v2" + Ontology.ALL_LANGUAGES)
else:
result = con.get("/ontology/" + name + "/v2" + Ontology.ALL_LANGUAGES)
Expand Down
5 changes: 3 additions & 2 deletions src/dsp_tools/models/permission.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
# pylint: disable=missing-class-docstring,missing-function-docstring

import re
from enum import Enum, unique
from typing import Optional, Union

import regex


@unique
class PermissionValue(Enum):
Expand Down Expand Up @@ -75,7 +76,7 @@ def fromString(cls, permstr: str):
tmpstr = permstr.split("|")
permissions: dict[PermissionValue, list[str]] = {}
for s in tmpstr:
key, *vals = re.split("[\\s,]+", s)
key, *vals = regex.split("[\\s,]+", s)
permissions[PermissionValue[key]] = vals
return cls(permissions)

Expand Down
5 changes: 3 additions & 2 deletions src/dsp_tools/models/propertyclass.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
# pylint: disable=missing-class-docstring,missing-function-docstring,duplicate-code

import json
import re
from typing import Any, Optional, Sequence, Union
from urllib.parse import quote_plus

import regex

from dsp_tools.models.connection import Connection
from dsp_tools.models.exceptions import BaseError
from dsp_tools.models.helpers import Actions, Context, DateTimeStamp, WithId
Expand Down Expand Up @@ -302,7 +303,7 @@ def resolve_propref(resref: str):
return {"@id": "knora-api:" + resref} # no ":", must be from knora-api!

tmp = {}
exp = re.compile("^http.*") # It is already a full IRI
exp = regex.compile("^http.*") # It is already a full IRI
if exp.match(self._ontology_id):
propid = self._context.prefix_from_iri(self._ontology_id) + ":" + self._name
ontid = self._ontology_id
Expand Down
9 changes: 5 additions & 4 deletions src/dsp_tools/models/resource.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import json
import re
from copy import deepcopy
from dataclasses import dataclass
from typing import Any, Optional, Type, Union
from urllib.parse import quote_plus

import regex

from dsp_tools.models.bitstream import Bitstream
from dsp_tools.models.connection import Connection
from dsp_tools.models.exceptions import BaseError
Expand Down Expand Up @@ -354,11 +355,11 @@ class ResourceInstanceFactory:

def __init__(self, con: Connection, projident: str) -> None:
self._con = con
if re.match("^[0-9a-fA-F]{4}$", projident):
if regex.match("^[0-9a-fA-F]{4}$", projident):
project = Project(con=con, shortcode=projident)
elif re.match("^[\\w-]+$", projident):
elif regex.match("^[\\w-]+$", projident):
project = Project(con=con, shortname=projident)
elif re.match("^(http)s?://([\\w\\.\\-~]+:?\\d{,4})(/[\\w\\-~]+)+$", projident):
elif regex.match("^(http)s?://([\\w\\.\\-~]+:?\\d{,4})(/[\\w\\-~]+)+$", projident):
project = Project(con=con, shortname=projident)
else:
raise BaseError("Invalid project identification!")
Expand Down
5 changes: 3 additions & 2 deletions src/dsp_tools/models/resourceclass.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,12 @@
# pylint: disable=missing-class-docstring,missing-function-docstring,too-many-instance-attributes,duplicate-code

import json
import re
from enum import Enum
from typing import Any, Optional, Sequence, Union
from urllib.parse import quote_plus

import regex

from dsp_tools.models.connection import Connection
from dsp_tools.models.exceptions import BaseError
from dsp_tools.models.helpers import Actions, Cardinality, Context, DateTimeStamp
Expand Down Expand Up @@ -635,7 +636,7 @@ def resolve_resref(resref: str):
return {"@id": "knora-api:" + resref} # no ":", must be from knora-api!

tmp = {}
exp = re.compile("^http.*") # It is already a full IRI
exp = regex.compile("^http.*") # It is already a full IRI
if exp.match(self._ontology_id):
resid = self._context.prefix_from_iri(self._ontology_id) + ":" + self._name
ontid = self._ontology_id
Expand Down