Skip to content

Commit

Permalink
URLBear: Use library to extract links
Browse files Browse the repository at this point in the history
This replaces the use of regex for extracting
links with the use of the URLExtract library.

Closes #1342
  • Loading branch information
CLiu13 committed Dec 12, 2018
1 parent 2888639 commit b4acb90
Show file tree
Hide file tree
Showing 4 changed files with 66 additions and 33 deletions.
1 change: 1 addition & 0 deletions bear-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ rstcheck~=3.1
safety~=1.8.2
scspell3k~=2.0
sqlparse~=0.2.4
urlextract~=0.8.3
vim-vint~=0.3.12,!=0.3.19
vulture~=0.25.0
yamllint~=1.12.0
Expand Down
45 changes: 14 additions & 31 deletions bears/general/URLBear.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import re

from aenum import Flag
from urlextract import URLExtract

from coalib.bears.LocalBear import LocalBear
from dependency_management.requirements.PipRequirement import PipRequirement
Expand Down Expand Up @@ -70,43 +71,25 @@ def parse_pip_vcs_url(link):

@staticmethod
def extract_links_from_file(file, link_ignore_regex, link_ignore_list):
xmlns_regex = re.compile(r'xmlns:?\w*="(.*)"')
link_ignore_regex = re.compile(link_ignore_regex)
regex = re.compile(
r"""
((git\+|bzr\+|svn\+|hg\+|) # For VCS URLs
https?:// # http:// or https:// as only these
# are supported by the ``requests``
# library
[^.:%\s_/?#[\]@\\]+ # Initial part of domain
\. # A required dot `.`
(
((?:%[A-Fa-f0-9][A-Fa-f0-9])*[^\s()%\'"`<>|\\\[\]]+)
# Path name
# This part allows percentage
# encoding like %3F
# and does not allow
# any parenthesis: balanced or
# unbalanced.
| # OR
\((?:%[A-Fa-f0-9][A-Fa-f0-9])*[^\s()%\'"`<>|\\\[\]]*\)
# Path name contained within ()
# This part allows path names that
# are explicitly enclosed within one
# set of parenthesis.
# An example can be:
# http://wik.org/Hello_(Adele_song)/200
)
*)
# Thus, the whole part above
# prevents matching of
# Unbalanced parenthesis
(?<!\.)(?<!,) # Exclude trailing `.` or `,` from URL
(git\+|bzr\+|svn\+|hg\+|) # For VCS URLs
https?://.* # http:// or https:// as only these
# are supported by the ``requests``
# library
""", re.VERBOSE)
file_context = {}
extractor = URLExtract()
for line_number, line in enumerate(file):
xmlns_regex = re.compile(r'xmlns:?\w*="(.*)"')
for match in re.findall(regex, line):
link = match[0]
if not re.findall(regex, line):
continue
urls = set(extractor.find_urls(line) or [])
for url in urls:
# URLExtract does not remove trailing `.,|\` characters
# See https://github.com/lipoja/URLExtract/issues/13
link = url.rstrip('.,|\\')
link_context = file_context.get(link)
if not link_context:
link_context = LINK_CONTEXT.no_context
Expand Down
7 changes: 5 additions & 2 deletions tests/general/InvalidLinkBearTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -370,8 +370,11 @@ def test_xml_namespaces(self):

self.check_validity(self.uut, valid_file)

# The first 3 URLs are invalid because they have invalid TLDs
# See the function _download_tlds_list(self) in the file below
# https://github.com/lipoja/URLExtract/blob/master/urlextract.py
self.check_line_result_count(self.uut, invalid_file,
[1, 1, 1, 1])
[0, 0, 0, 1])

info_severity_file = """
<ruleset name="test" xmlns="http://this.is.a.namespace/ruleset/7.0.0"
Expand Down Expand Up @@ -413,7 +416,7 @@ def test_links_to_ignore(self):
http://example.co.in/404""".splitlines()

link_ignore_list = [
'http://coalaisthebest.com/',
'http://coalaisthebest.com',
'http://httpbin.org/status/4[0-9][0-9]',
'http://httpbin.org/status/410',
'http://httpbin.org/status/5[0-9][0-9]',
Expand Down
46 changes: 46 additions & 0 deletions tests/general/URLBearTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,16 @@ def test_detect_url_result(self):
[3, 'http://www.google.com/404',
LINK_CONTEXT.no_context])

def test_detect_pip_vcs_url_result(self):
    """A VCS-prefixed (``git+``) link is reported with pip VCS context."""
    sample = """
git+http://www.github.com/foo
""".splitlines()

    results = get_results(self.uut, sample)
    # Line 2 holds the link; the git+ prefix is stripped from the URL.
    self.assertEqual(
        results[0].contents,
        [2, 'http://www.github.com/foo', LINK_CONTEXT.pip_vcs_url])

def test_precentage_encoded_url(self):
valid_file = """
# A URL with a percentage-encoded character in path
Expand All @@ -53,6 +63,42 @@ def test_precentage_encoded_url(self):
'yes-green.svg/200'),
LINK_CONTEXT.no_context])

def test_detect_enclosed_parenthesis_url_result(self):
    """A path containing balanced parentheses is extracted intact."""
    sample = """
http://wik.org/Hello_(Adele_song)/200
""".splitlines()

    results = get_results(self.uut, sample)
    self.assertEqual(
        results[0].contents,
        [2, 'http://wik.org/Hello_(Adele_song)/200',
         LINK_CONTEXT.no_context])

def test_detect_trailing_char_url_result(self):
    """A trailing ``.`` is stripped from the extracted URL."""
    sample = """
http://google.com/trailing.
""".splitlines()

    results = get_results(self.uut, sample)
    # The reported link must not carry the trailing dot.
    self.assertEqual(
        results[0].contents,
        [2, 'http://google.com/trailing', LINK_CONTEXT.no_context])

def test_detect_example_url_result(self):
    """No result is reported for http://example.com.

    Presumably example.com is on an ignore list — cannot tell from
    this test alone; confirm against the bear's configuration.
    """
    sample = """
http://example.com
""".splitlines()

    self.assertEqual(get_results(self.uut, sample), [])

def test_detect_no_scheme_url_result(self):
    """A bare domain with no http(s) scheme produces no result."""
    sample = """
foo.com
""".splitlines()

    self.assertEqual(get_results(self.uut, sample), [])


class URLResultTest(unittest.TestCase):

Expand Down

0 comments on commit b4acb90

Please sign in to comment.