Skip to content

Commit

Permalink
URLBear: Use library to extract links
Browse files Browse the repository at this point in the history
This replaces the use of regex for extracting
links with the use of the URLExtract library.

Closes #1342
  • Loading branch information
CLiu13 committed Dec 12, 2018
1 parent 2888639 commit b4acb90
Show file tree
Hide file tree
Showing 4 changed files with 66 additions and 33 deletions.
1 change: 1 addition & 0 deletions bear-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ rstcheck~=3.1
safety~=1.8.2
scspell3k~=2.0
sqlparse~=0.2.4
urlextract~=0.8.3
vim-vint~=0.3.12,!=0.3.19
vulture~=0.25.0
yamllint~=1.12.0
Expand Down
45 changes: 14 additions & 31 deletions bears/general/URLBear.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import re

from aenum import Flag
from urlextract import URLExtract

from coalib.bears.LocalBear import LocalBear
from dependency_management.requirements.PipRequirement import PipRequirement
Expand Down Expand Up @@ -70,43 +71,25 @@ def parse_pip_vcs_url(link):

@staticmethod
def extract_links_from_file(file, link_ignore_regex, link_ignore_list):
xmlns_regex = re.compile(r'xmlns:?\w*="(.*)"')
link_ignore_regex = re.compile(link_ignore_regex)
regex = re.compile(
r"""
((git\+|bzr\+|svn\+|hg\+|) # For VCS URLs
https?:// # http:// or https:// as only these
# are supported by the ``requests``
# library
[^.:%\s_/?#[\]@\\]+ # Initial part of domain
\. # A required dot `.`
(
((?:%[A-Fa-f0-9][A-Fa-f0-9])*[^\s()%\'"`<>|\\\[\]]+)
# Path name
# This part allows percentage
# encoding like %3F
# and does not allow
# any parenthesis: balanced or
# unbalanced.
| # OR
\((?:%[A-Fa-f0-9][A-Fa-f0-9])*[^\s()%\'"`<>|\\\[\]]*\)
# Path name contained within ()
# This part allows path names that
# are explicitly enclosed within one
# set of parenthesis.
# An example can be:
# http://wik.org/Hello_(Adele_song)/200
)
*)
# Thus, the whole part above
# prevents matching of
# Unbalanced parenthesis
(?<!\.)(?<!,) # Exclude trailing `.` or `,` from URL
(git\+|bzr\+|svn\+|hg\+|) # For VCS URLs
https?://.* # http:// or https:// as only these
# are supported by the ``requests``
# library
""", re.VERBOSE)
file_context = {}
extractor = URLExtract()
for line_number, line in enumerate(file):
xmlns_regex = re.compile(r'xmlns:?\w*="(.*)"')
for match in re.findall(regex, line):
link = match[0]
if not re.findall(regex, line):
continue
urls = set(extractor.find_urls(line) or [])
for url in urls:
# URLExtract does not remove trailing `.,|\` characters
# See https://github.com/lipoja/URLExtract/issues/13
link = url.rstrip('.,|\\')
link_context = file_context.get(link)
if not link_context:
link_context = LINK_CONTEXT.no_context
Expand Down
7 changes: 5 additions & 2 deletions tests/general/InvalidLinkBearTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -370,8 +370,11 @@ def test_xml_namespaces(self):

self.check_validity(self.uut, valid_file)

# The first 3 URLs are invalid because they have invalid TLDs
# See the function _download_tlds_list(self) in the file below
# https://github.com/lipoja/URLExtract/blob/master/urlextract.py
self.check_line_result_count(self.uut, invalid_file,
[1, 1, 1, 1])
[0, 0, 0, 1])

info_severity_file = """
<ruleset name="test" xmlns="http://this.is.a.namespace/ruleset/7.0.0"
Expand Down Expand Up @@ -413,7 +416,7 @@ def test_links_to_ignore(self):
http://example.co.in/404""".splitlines()

link_ignore_list = [
'http://coalaisthebest.com/',
'http://coalaisthebest.com',
'http://httpbin.org/status/4[0-9][0-9]',
'http://httpbin.org/status/410',
'http://httpbin.org/status/5[0-9][0-9]',
Expand Down
46 changes: 46 additions & 0 deletions tests/general/URLBearTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,16 @@ def test_detect_url_result(self):
[3, 'http://www.google.com/404',
LINK_CONTEXT.no_context])

def test_detect_pip_vcs_url_result(self):
    """A VCS-prefixed (``git+``) link is reported with pip VCS context."""
    sample = """
git+http://www.github.com/foo
""".splitlines()

    results = get_results(self.uut, sample)
    # Line 2 holds the link; the git+ prefix is stripped from the URL.
    self.assertEqual(
        results[0].contents,
        [2, 'http://www.github.com/foo', LINK_CONTEXT.pip_vcs_url])

def test_precentage_encoded_url(self):
valid_file = """
# A URL with a percentage-encoded character in path
Expand All @@ -53,6 +63,42 @@ def test_precentage_encoded_url(self):
'yes-green.svg/200'),
LINK_CONTEXT.no_context])

def test_detect_enclosed_parenthesis_url_result(self):
    """A path containing balanced parentheses is extracted intact."""
    sample = """
http://wik.org/Hello_(Adele_song)/200
""".splitlines()

    results = get_results(self.uut, sample)
    self.assertEqual(
        results[0].contents,
        [2, 'http://wik.org/Hello_(Adele_song)/200',
         LINK_CONTEXT.no_context])

def test_detect_trailing_char_url_result(self):
    """A trailing ``.`` is stripped from the extracted URL."""
    sample = """
http://google.com/trailing.
""".splitlines()

    results = get_results(self.uut, sample)
    # The reported link must not carry the trailing dot.
    self.assertEqual(
        results[0].contents,
        [2, 'http://google.com/trailing', LINK_CONTEXT.no_context])

def test_detect_example_url_result(self):
    """No result is reported for http://example.com.

    Presumably example.com is on an ignore list — cannot tell from
    this test alone; confirm against the bear's configuration.
    """
    sample = """
http://example.com
""".splitlines()

    self.assertEqual(get_results(self.uut, sample), [])

def test_detect_no_scheme_url_result(self):
    """A bare domain with no http(s) scheme produces no result."""
    sample = """
foo.com
""".splitlines()

    self.assertEqual(get_results(self.uut, sample), [])


class URLResultTest(unittest.TestCase):

Expand Down

0 comments on commit b4acb90

Please sign in to comment.