Permalink
Browse files

don't follow off-site CSS links

  • Loading branch information...
1 parent cd2065f commit fbc2a67f087d66e33979de458a4c3be439af6b9f @mihneadb mihneadb committed Sep 19, 2012
Showing with 7 additions and 10 deletions.
  1. +7 −10 spade/scraper/middlewares.py
@@ -7,11 +7,8 @@
from scrapy import log
-
-def is_css_or_js(response_or_request):
- exts = ["css", "js"]
- return urlparse_cached(response_or_request).path.split(".")[-1] in exts
-
def has_extension(response_or_request, ext):
    """Return True if the URL path of *response_or_request* ends in *ext*.

    The check compares only the text after the final ``.`` in the URL path
    (query string and fragment are excluded by ``urlparse_cached``).
    NOTE(review): the comparison is case-sensitive, so e.g. ``.JS`` will not
    match ``'js'`` — confirm that is the intended behavior.
    """
    path = urlparse_cached(response_or_request).path
    # rsplit with maxsplit=1 yields the same final segment as split(".")[-1];
    # a path with no dot compares the whole path against ext (and won't match).
    return path.rsplit(".", 1)[-1] == ext
class OffsiteMiddleware(offsite.OffsiteMiddleware):
@@ -32,11 +29,11 @@ def process_spider_output(self, response, result, spider):
def should_follow(self, response, request):
- """Only follow offsite links to CSS and JS files, not new pages."""
+ """Only follow offsite links to JS files, not new pages."""
res_url_data = urlparse_cached(response)
req_url_data = urlparse_cached(request)
- if is_css_or_js(request):
+ if has_extension(request, 'js'):
return True
# Otherwise, ensure that the domains share the same root origin
@@ -45,12 +42,12 @@ def should_follow(self, response, request):
class DepthMiddleware(depth.DepthMiddleware):
- """A depth middleware that exempts CSS and JS files."""
+ """A depth middleware that exempts JS files."""
def process_spider_output(self, response, result, spider):
- """Ignore depth restrictions for CSS/JS links."""
+ """Ignore depth restrictions for JS links."""
check_depth = []
for req in result or []:
- if isinstance(req, Request) and is_css_or_js(req):
+ if isinstance(req, Request) and has_extension(req, 'js'):
yield req
else:
check_depth.append(req)

0 comments on commit fbc2a67

Please sign in to comment.