Skip to content
Browse files

Resolve parent pointers.

  • Loading branch information...
1 parent c9ef391 commit e09f4d48f1e5b3eeb7b71fadd54c45bebbd5d082 @malthe malthe committed Aug 21, 2012
Showing with 35 additions and 1 deletion.
  1. +3 −0 CHANGES.rst
  2. +32 −1 src/collective/linkcheck/parse.py
View
3 CHANGES.rst
@@ -3,6 +3,9 @@ Changes
In next release ...
+- Resolve parent-directory references ("../") in links to avoid indexing
+ the same target more than once.
+
- Always enter run loop and routinely poll for new sites.
- Fixed issue where the composite queue implementation would be used
View
33 src/collective/linkcheck/parse.py
@@ -1,3 +1,4 @@
+import os
import logging
import lxml.html
@@ -16,6 +17,36 @@ def iter_links(body):
tree = html.getroottree()
for link in html.iterfind('.//a'):
+ base = None
href = link.attrib.get('href')
- if href is not None:
+
+ while '../' in href:
+ if '://' not in href:
+ if base is None:
+ try:
+ base = html.find('.//base').attrib['href']
+ except BaseException:
+ base = ""
+ else:
+ base = base.rstrip('/') + '/'
+
+ if base:
+ href = base + href
+ href = '/' + href.split('://', 1)[1].split('/', 1)[-1]
+
+ i = href.find('../')
+ assert i > -1
+
+ if i == 0:
+ continue
+
+ previous = href.rfind('/', 0, i - 1)
+ after = href[i + 3:]
+
+ if previous == -1:
+ href = after
+ else:
+ href = href[:previous] + "/" + after
+
+ if href:
yield href

0 comments on commit e09f4d4

Please sign in to comment.
Something went wrong with that request. Please try again.