Skip to content

Commit

Permalink
htmlchecker: Support version sorting and filtering
Browse files Browse the repository at this point in the history
  • Loading branch information
gasinvein committed Jun 28, 2022
1 parent 90c84e9 commit 77eac1e
Show file tree
Hide file tree
Showing 4 changed files with 97 additions and 29 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,8 @@ set single pattern containing two nested match groups for both url and version:

To disable sorting and get the first matched version/url, set `sort-matches` to `false`.

The [`versions`](#version-constraining) property is supported.

#### URL templates

The HTML checker also supports building the download URL using
Expand Down
83 changes: 54 additions & 29 deletions src/checkers/htmlchecker.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,39 +21,23 @@
import logging
import re
import urllib.parse
from distutils.version import LooseVersion
import io
import codecs
import typing as t

import aiohttp
from yarl import URL
import semver

from ..lib import NETWORK_ERRORS
from ..lib import NETWORK_ERRORS, OPERATORS_SCHEMA
from ..lib.externaldata import ExternalBase, ExternalData
from ..lib.errors import CheckerMetadataError, CheckerQueryError, CheckerFetchError
from ..lib.checkers import Checker
from ..lib.utils import filter_versioned_items, FallbackVersion

log = logging.getLogger(__name__)


def _get_latest(
    html: str,
    pattern: re.Pattern,
    sort_key: t.Optional[t.Callable[[re.Match], t.Any]] = None,
) -> re.Match:
    """Select a single match of ``pattern`` within ``html``.

    Args:
        html: Text to search.
        pattern: Compiled regular expression applied with ``finditer``.
        sort_key: Optional key function over matches. When given and more
            than one match exists, the match with the greatest key wins;
            otherwise the first match in document order is used.

    Returns:
        The selected match object.

    Raises:
        CheckerQueryError: If ``pattern`` does not match anywhere in ``html``.
    """
    found = [*pattern.finditer(html)]
    if not found:
        raise CheckerQueryError(f"Pattern '{pattern.pattern}' didn't match anything")

    # A lone match (or no key function) needs no ranking at all.
    take_first = sort_key is None or len(found) == 1
    if not take_first:
        log.debug("%s matched multiple times, selected latest", pattern.pattern)
    chosen = found[0] if take_first else max(found, key=sort_key)

    log.debug("%s matched %s", pattern.pattern, chosen)
    return chosen


def _get_pattern(
checker_data: t.Dict, pattern_name: str, expected_groups: int = 1
) -> t.Optional[re.Pattern]:
Expand All @@ -73,6 +57,19 @@ def _get_pattern(
return pattern


def _semantic_version(version: str) -> semver.VersionInfo:
    """Parse ``version`` with ``semver.VersionInfo.parse``.

    Args:
        version: Version string to parse.

    Returns:
        The parsed ``semver.VersionInfo``.

    Raises:
        CheckerQueryError: If ``semver`` rejects the string with a
            ``ValueError``; the original exception is chained as the cause.
    """
    try:
        return semver.VersionInfo.parse(version)
    except ValueError as err:
        # Include the offending value: this parser is applied to configured
        # constraints as well as to versions scraped from the page, so a bare
        # "can't parse" gives no hint about which string has to be fixed.
        raise CheckerQueryError(f"Can't parse version '{version}'") from err


# Maps each accepted "version-scheme" value to the callable that converts a
# version string into a version object. The keys are also used to build the
# "version-scheme" enum in the checker data schema, and "loose" is the scheme
# picked when the option is not set.
_VERSION_SCHEMES = {
"loose": FallbackVersion,
"semantic": _semantic_version,
}


class HTMLChecker(Checker):
CHECKER_DATA_TYPE = "html"
CHECKER_DATA_SCHEMA = {
Expand All @@ -83,6 +80,11 @@ class HTMLChecker(Checker):
"version-pattern": {"type": "string", "format": "regex"},
"url-template": {"type": "string", "format": "regex"},
"sort-matches": {"type": "boolean"},
"versions": OPERATORS_SCHEMA,
"version-scheme": {
"type": "string",
"enum": list(_VERSION_SCHEMES),
},
},
"allOf": [
{"required": ["url"]},
Expand Down Expand Up @@ -141,23 +143,46 @@ async def check(self, external_data: ExternalBase):
version_pattern = _get_pattern(external_data.checker_data, "version-pattern", 1)
url_template = external_data.checker_data.get("url-template")
sort_matches = external_data.checker_data.get("sort-matches", True)
version_cls = _VERSION_SCHEMES[
external_data.checker_data.get("version-scheme", "loose")
]
constraints = [
(o, version_cls(v))
for o, v in external_data.checker_data.get("versions", {}).items()
]
assert combo_pattern or (version_pattern and url_template)

html = await self._get_text(url)

def _get_latest(pattern: re.Pattern, ver_group: int) -> re.Match:
    """Return the match of ``pattern`` in the fetched page to report.

    All matches of ``pattern`` in ``html`` are handed to
    ``filter_versioned_items`` together with the configured ``constraints``
    and the ``sort_matches`` flag; the version of each match is built by
    applying ``version_cls`` to capture group ``ver_group``.

    Args:
        pattern: Compiled regular expression to search the page with.
        ver_group: Index of the capture group holding the version string.

    Returns:
        The selected match object.

    Raises:
        CheckerQueryError: If no match is left to choose from.
    """
    matches = filter_versioned_items(
        items=pattern.finditer(html),
        constraints=constraints,
        to_version=lambda m: version_cls(m.group(ver_group)),
        sort=sort_matches,
    )
    # This guard is the single place that reports an empty result; a second
    # IndexError handler around the indexing below could never fire after it.
    # NOTE(review): assumes filter_versioned_items returns a list-like
    # sequence (it is indexed below) - confirm against the helper.
    if not matches:
        raise CheckerQueryError(
            f"Pattern '{pattern.pattern}' didn't match anything"
        )

    # NOTE Returning last match when sort is requested and first match otherwise
    # doesn't seem sensible, but we need to retain backward compatibility
    result = matches[-1 if sort_matches else 0]

    log.debug("%s matched %s", pattern.pattern, result)
    return result

if combo_pattern:
latest_url, latest_version = _get_latest(
html,
combo_pattern,
(lambda m: LooseVersion(m.group(2))) if sort_matches else None,
).group(1, 2)
latest_url, latest_version = _get_latest(combo_pattern, 2).group(1, 2)
else:
assert version_pattern and url_template
latest_version = _get_latest(
html,
version_pattern,
(lambda m: LooseVersion(m.group(1))) if sort_matches else None,
).group(1)
latest_version = _get_latest(version_pattern, 1).group(1)
latest_url = self._substitute_placeholders(url_template, latest_version)

abs_url = urllib.parse.urljoin(base=url, url=latest_url)
Expand Down
28 changes: 28 additions & 0 deletions tests/org.x.xeyes.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,34 @@ modules:
pattern: <link>(https://sourceforge.net/.+/qrupdate-([\d\.]+\d)\.tar\.gz)/download</link>
sort-matches: false

- name: libX11
sources:
- type: archive
url: http://some-incorrect.url/libX11.tar.gz
sha256: "0000000000000000000000000000000000000000000000000000000000000000"
x-checker-data:
type: html
url: https://www.x.org/releases/individual/lib/
version-pattern: libX11-([\d\.]+).tar.gz
url-template: libX11-$version.tar.gz
versions:
==: 1.7.5

- name: semver
sources:
- type: file
url: http://example.com/semver.txt
sha256: "0000000000000000000000000000000000000000000000000000000000000000"
x-checker-data:
type: html
# printf '%s\n' v1.0.0 v1.0.0+patch1 v2.0.0-rc1 v2.0.0 | base64
url: http://httpbingo.org/base64/djEuMC4wCnYxLjAuMCtwYXRjaDEKdjIuMC4wLXJjMQp2Mi4wLjAK
version-pattern: v(\d.*)
url-template: http://httpbingo.org/base64/encode/$version
versions:
<: 2.0.0-alpha
version-scheme: semantic

- name: libFS
sources:
- type: archive
Expand Down
13 changes: 13 additions & 0 deletions tests/test_htmlchecker.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,8 @@ async def test_check(self):
self._test_combo_pattern_nosort(
self._find_by_filename(ext_data, "qrupdate-1.1.0.tar.gz")
)
self._test_version_filter(self._find_by_filename(ext_data, "libX11.tar.gz"))
self._test_semver_filter(self._find_by_filename(ext_data, "semver.txt"))
self._test_no_match(self._find_by_filename(ext_data, "libFS-1.0.7.tar.bz2"))
self._test_invalid_url(self._find_by_filename(ext_data, "libdoesntexist.tar"))

Expand Down Expand Up @@ -144,6 +146,17 @@ def _test_combo_pattern_nosort(self, data):
),
)

def _test_version_filter(self, data):
    """Expect version 1.7.5 to be reported for the libX11 fixture.

    The fixture constrains the checker with ``versions: {"==": 1.7.5}``.
    """
    self.assertIsNotNone(data)
    found = data.new_version
    self.assertIsNotNone(found)
    self.assertEqual(found.version, "1.7.5")

def _test_semver_filter(self, data):
    """Expect 1.0.0+patch1 to be reported for the semver fixture.

    The fixture serves v1.0.0, v1.0.0+patch1, v2.0.0-rc1 and v2.0.0, and
    restricts results to ``< 2.0.0-alpha`` with the "semantic" version scheme.
    """
    self.assertIsNotNone(data)
    found = data.new_version
    self.assertIsNotNone(found)
    reported = found.version
    self.assertIsNotNone(reported)
    self.assertEqual(reported, "1.0.0+patch1")

def _test_no_match(self, data):
self.assertIsNotNone(data)
self.assertIsNone(data.new_version)
Expand Down

0 comments on commit 77eac1e

Please sign in to comment.