Skip to content

Commit

Permalink
URLBear: Use library to extract links
Browse files Browse the repository at this point in the history
This replaces the use of regex for extracting
links with the use of the URLExtract library.

Closes #1342
  • Loading branch information
CLiu13 committed Feb 18, 2019
1 parent 2888639 commit 75f1da7
Show file tree
Hide file tree
Showing 4 changed files with 71 additions and 38 deletions.
1 change: 1 addition & 0 deletions bear-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ rstcheck~=3.1
safety~=1.8.2
scspell3k~=2.0
sqlparse~=0.2.4
urlextract==0.9
vim-vint~=0.3.12,!=0.3.19
vulture~=0.25.0
yamllint~=1.12.0
Expand Down
45 changes: 14 additions & 31 deletions bears/general/URLBear.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import re

from aenum import Flag
from urlextract import URLExtract

from coalib.bears.LocalBear import LocalBear
from dependency_management.requirements.PipRequirement import PipRequirement
Expand Down Expand Up @@ -70,43 +71,25 @@ def parse_pip_vcs_url(link):

@staticmethod
def extract_links_from_file(file, link_ignore_regex, link_ignore_list):
xmlns_regex = re.compile(r'xmlns:?\w*="(.*)"')
link_ignore_regex = re.compile(link_ignore_regex)
regex = re.compile(
r"""
((git\+|bzr\+|svn\+|hg\+|) # For VCS URLs
https?:// # http:// or https:// as only these
# are supported by the ``requests``
# library
[^.:%\s_/?#[\]@\\]+ # Initial part of domain
\. # A required dot `.`
(
((?:%[A-Fa-f0-9][A-Fa-f0-9])*[^\s()%\'"`<>|\\\[\]]+)
# Path name
# This part allows precentage
# encoding like %3F
# and does not allow
# any parenthesis: balanced or
# unbalanced.
| # OR
\((?:%[A-Fa-f0-9][A-Fa-f0-9])*[^\s()%\'"`<>|\\\[\]]*\)
# Path name contained within ()
# This part allows path names that
# are explicitly enclosed within one
# set of parenthesis.
# An example can be:
# http://wik.org/Hello_(Adele_song)/200
)
*)
# Thus, the whole part above
# prevents matching of
# Unbalanced parenthesis
(?<!\.)(?<!,) # Exclude trailing `.` or `,` from URL
(git\+|bzr\+|svn\+|hg\+|) # For VCS URLs
https?://.* # http:// or https:// as only these
# are supported by the ``requests``
# library
""", re.VERBOSE)
file_context = {}
extractor = URLExtract()
for line_number, line in enumerate(file):
xmlns_regex = re.compile(r'xmlns:?\w*="(.*)"')
for match in re.findall(regex, line):
link = match[0]
if not re.findall(regex, line):
continue
urls = set(extractor.find_urls(line) or [])
for url in urls:
# URLExtract does not remove trailing `.,|\` characters
# See https://github.com/lipoja/URLExtract/issues/13
link = url.rstrip('.,|\\')
link_context = file_context.get(link)
if not link_context:
link_context = LINK_CONTEXT.no_context
Expand Down
17 changes: 10 additions & 7 deletions tests/general/InvalidLinkBearTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,7 +360,7 @@ def test_xml_namespaces(self):
""".splitlines()

invalid_file = """
<ruleset name="test" xmlns="http://this.isa.namespace/ruleset/7.0.0"
<ruleset name="test" xmlns="http://im.a.namespace.com/ruleset/7.0.0"
xmlns:xsi="http://this.is.another/kindof/namespace"
xsi:schemaLocation="http://this.namespace.dosent/exists/7.0.0"
xsi:schemaLocation="http://httpbin.com/404">""".splitlines()
Expand All @@ -370,8 +370,11 @@ def test_xml_namespaces(self):

self.check_validity(self.uut, valid_file)

# The 2nd and 3rd URLs are invalid because they have invalid TLDs.
# See the function _download_tlds_list(self) in the file linked below:
# https://github.com/lipoja/URLExtract/blob/master/urlextract.py
self.check_line_result_count(self.uut, invalid_file,
[1, 1, 1, 1])
[1, 0, 0, 1])

info_severity_file = """
<ruleset name="test" xmlns="http://this.is.a.namespace/ruleset/7.0.0"
Expand Down Expand Up @@ -413,7 +416,7 @@ def test_links_to_ignore(self):
http://example.co.in/404""".splitlines()

link_ignore_list = [
'http://coalaisthebest.com/',
'http://coalaisthebest.com',
'http://httpbin.org/status/4[0-9][0-9]',
'http://httpbin.org/status/410',
'http://httpbin.org/status/5[0-9][0-9]',
Expand Down Expand Up @@ -463,15 +466,15 @@ def response(status_code, *args, **kwargs):

self.check_validity(self.uut, ['https://gitmate.io'])
mock.assert_has_calls([
unittest.mock.call('https://facebook.com/', timeout=2,
unittest.mock.call('https://facebook.com', timeout=2,
allow_redirects=False),
unittest.mock.call('https://google.com/',
unittest.mock.call('https://google.com',
timeout=10, allow_redirects=False),
unittest.mock.call('https://coala.io/som/thingg/page/123',
timeout=25, allow_redirects=False),
unittest.mock.call('https://facebook.com/', timeout=20,
unittest.mock.call('https://facebook.com', timeout=20,
allow_redirects=False),
unittest.mock.call('https://google.com/',
unittest.mock.call('https://google.com',
timeout=20, allow_redirects=False),
unittest.mock.call('https://coala.io/som/thingg/page/123',
timeout=20, allow_redirects=False),
Expand Down
46 changes: 46 additions & 0 deletions tests/general/URLBearTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,16 @@ def test_detect_url_result(self):
[3, 'http://www.google.com/404',
LINK_CONTEXT.no_context])

def test_detect_pip_vcs_url_result(self):
    """A pip-style VCS link (``git+http://...``) is reported with the
    ``pip_vcs_url`` context and the VCS prefix stripped from the URL."""
    file_contents = """
git+http://www.github.com/foo.git
""".splitlines()

    results = get_results(self.uut, file_contents)
    detected = results[0].contents
    self.assertEqual(detected,
                     [2, 'http://www.github.com/foo.git',
                      LINK_CONTEXT.pip_vcs_url])

def test_precentage_encoded_url(self):
valid_file = """
# A url with a precentage-encoded character in path
Expand All @@ -53,6 +63,42 @@ def test_precentage_encoded_url(self):
'yes-green.svg/200'),
LINK_CONTEXT.no_context])

def test_detect_enclosed_parenthesis_url_result(self):
    """A URL whose path contains balanced parentheses is detected
    intact, parentheses included."""
    file_contents = """
http://wik.org/Hello_(Adele_song)/200
""".splitlines()

    results = get_results(self.uut, file_contents)
    detected = results[0].contents
    self.assertEqual(detected,
                     [2, 'http://wik.org/Hello_(Adele_song)/200',
                      LINK_CONTEXT.no_context])

def test_detect_trailing_char_url_result(self):
    """A trailing ``.`` after a URL is stripped from the detected link."""
    file_contents = """
http://google.com/trailing.
""".splitlines()

    results = get_results(self.uut, file_contents)
    detected = results[0].contents
    self.assertEqual(detected,
                     [2, 'http://google.com/trailing',
                      LINK_CONTEXT.no_context])

def test_detect_example_url_result(self):
    """A bare ``http://example.com`` link yields no results."""
    file_contents = """
http://example.com
""".splitlines()

    self.assertEqual(get_results(self.uut, file_contents), [])

def test_detect_no_scheme_url_result(self):
    """A scheme-less domain (``foo.com``) is not reported as a link."""
    file_contents = """
foo.com
""".splitlines()

    self.assertEqual(get_results(self.uut, file_contents), [])


class URLResultTest(unittest.TestCase):

Expand Down

0 comments on commit 75f1da7

Please sign in to comment.