Commit 5305772

No need to keep extracted links as instance attribute. fixes scrapy#763
dangra committed Jun 25, 2014
1 parent ccec728 commit 5305772
Showing 1 changed file with 4 additions and 7 deletions.
scrapy/contrib/linkextractors/lxmlhtml.py (11 changes: 4 additions & 7 deletions)
@@ -34,8 +34,6 @@ def __init__(self, tag="a", attr="href", process=None, unique=False):
         self.process_attr = process if callable(process) else lambda v: v
         self.unique = unique

-        self.links = []
-
     def _iter_links(self, document):
         for el in document.iter(etree.Element):
             tag = _nons(el.tag)
@@ -46,6 +44,7 @@ def _iter_links(self, document):
                 yield (el, attrib, attribs[attrib])

     def _extract_links(self, selector, response_url, response_encoding, base_url):
+        links = []
         # hacky way to get the underlying lxml parsed document
         for el, attr, attr_val in self._iter_links(selector._root):
             if self.scan_tag(el.tag) and self.scan_attr(attr):
@@ -60,12 +59,10 @@ def _extract_links(self, selector, response_url, response_encoding, base_url):
                 url = urljoin(response_url, url)
                 link = Link(url, _collect_string_content(el) or u'',
                             nofollow=True if el.get('rel') == 'nofollow' else False)
-                self.links.append(link)
-
-        links = unique_list(self.links, key=lambda link: link.url) \
-            if self.unique else self.links
+                links.append(link)

-        return links
+        return unique_list(links, key=lambda link: link.url) \
+            if self.unique else links

     def extract_links(self, response):
         html = Selector(response)
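Why the fix matters: because the old code appended to self.links and never reset it, a single extractor instance accumulated links across calls, so a reused extractor returned links from earlier responses as well (a plausible reading of scrapy#763). Below is a minimal standalone sketch of the before/after behavior; the class names are hypothetical, not Scrapy code:

    class StatefulExtractor(object):
        """Old shape: results accumulate in an instance attribute."""
        def __init__(self):
            self.links = []

        def extract(self, urls):
            for url in urls:
                self.links.append(url)
            return self.links

    class StatelessExtractor(object):
        """Fixed shape: a fresh local list on every call."""
        def extract(self, urls):
            links = []
            for url in urls:
                links.append(url)
            return links

    old = StatefulExtractor()
    print(old.extract(['http://a/']))  # ['http://a/']
    print(old.extract(['http://b/']))  # ['http://a/', 'http://b/']  <- stale link leaks in

    new = StatelessExtractor()
    print(new.extract(['http://a/']))  # ['http://a/']
    print(new.extract(['http://b/']))  # ['http://b/']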

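For reference, the dedup step in the new return statement keys on link.url. A sketch of unique_list's assumed semantics (order-preserving, first occurrence of each key wins; in Scrapy of this era it appears to come from scrapy.utils.python, but treat that location as an assumption):

    def unique_list(list_, key=lambda x: x):
        # Keep the first item seen for each key, preserving input order.
        seen = set()
        result = []
        for item in list_:
            k = key(item)
            if k in seen:
                continue
            seen.add(k)
            result.append(item)
        return result

    urls = ['http://a/', 'http://b/', 'http://a/']
    print(unique_list(urls))  # ['http://a/', 'http://b/']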