Commit 5305772

No need to keep extracted links as instance attribute. fixes scrapy#763
dangra committed Jun 25, 2014
1 parent ccec728 commit 5305772
Showing 1 changed file with 4 additions and 7 deletions.
scrapy/contrib/linkextractors/lxmlhtml.py (11 changes: 4 additions & 7 deletions)
@@ -34,8 +34,6 @@ def __init__(self, tag="a", attr="href", process=None, unique=False):
         self.process_attr = process if callable(process) else lambda v: v
         self.unique = unique

-        self.links = []
-
     def _iter_links(self, document):
         for el in document.iter(etree.Element):
             tag = _nons(el.tag)
@@ -46,6 +44,7 @@ def _iter_links(self, document):
                 yield (el, attrib, attribs[attrib])

     def _extract_links(self, selector, response_url, response_encoding, base_url):
+        links = []
         # hacky way to get the underlying lxml parsed document
         for el, attr, attr_val in self._iter_links(selector._root):
             if self.scan_tag(el.tag) and self.scan_attr(attr):
@@ -60,12 +59,10 @@ def _extract_links(self, selector, response_url, response_encoding, base_url):
                 url = urljoin(response_url, url)
                 link = Link(url, _collect_string_content(el) or u'',
                             nofollow=True if el.get('rel') == 'nofollow' else False)
-                self.links.append(link)
-
-        links = unique_list(self.links, key=lambda link: link.url) \
-            if self.unique else self.links
+                links.append(link)

-        return links
+        return unique_list(links, key=lambda link: link.url) \
+            if self.unique else links

     def extract_links(self, response):
         html = Selector(response)
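Why the fix matters: because the old code appended to self.links and never reset it, a single extractor instance accumulated links across calls, so a reused extractor returned links from earlier responses as well (a plausible reading of scrapy#763). Below is a minimal standalone sketch of the before/after behavior; the class names are hypothetical, not Scrapy code:

    class StatefulExtractor(object):
        """Old shape: results accumulate in an instance attribute."""
        def __init__(self):
            self.links = []

        def extract(self, urls):
            for url in urls:
                self.links.append(url)
            return self.links

    class StatelessExtractor(object):
        """Fixed shape: a fresh local list on every call."""
        def extract(self, urls):
            links = []
            for url in urls:
                links.append(url)
            return links

    old = StatefulExtractor()
    print(old.extract(['http://a/']))  # ['http://a/']
    print(old.extract(['http://b/']))  # ['http://a/', 'http://b/']  <- stale link leaks in

    new = StatelessExtractor()
    print(new.extract(['http://a/']))  # ['http://a/']
    print(new.extract(['http://b/']))  # ['http://b/']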

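For reference, the dedup step in the new return statement keys on link.url. A sketch of unique_list's assumed semantics (order-preserving, first occurrence of each key wins; in Scrapy of this era it appears to come from scrapy.utils.python, but treat that location as an assumption):

    def unique_list(list_, key=lambda x: x):
        # Keep the first item seen for each key, preserving input order.
        seen = set()
        result = []
        for item in list_:
            k = key(item)
            if k in seen:
                continue
            seen.add(k)
            result.append(item)
        return result

    urls = ['http://a/', 'http://b/', 'http://a/']
    print(unique_list(urls))  # ['http://a/', 'http://b/']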