Skip to content

Commit

Permalink
Don't follow offsite CSS links
Browse files (browse the repository at this point in the history)
  • Loading branch information
mihneadb committed Sep 19, 2012
1 parent cd2065f commit fbc2a67
Showing 1 changed file with 7 additions and 10 deletions.
17 changes: 7 additions & 10 deletions spade/scraper/middlewares.py
Expand Up @@ -7,11 +7,8 @@
from scrapy import log from scrapy import log





- def is_css_or_js(response_or_request):
-     exts = ["css", "js"]
-     return urlparse_cached(response_or_request).path.split(".")[-1] in exts
+ def has_extension(response_or_request, ext):
+     return urlparse_cached(response_or_request).path.split(".")[-1] == ext





class OffsiteMiddleware(offsite.OffsiteMiddleware): class OffsiteMiddleware(offsite.OffsiteMiddleware):
Expand All @@ -32,11 +29,11 @@ def process_spider_output(self, response, result, spider):




def should_follow(self, response, request): def should_follow(self, response, request):
"""Only follow offsite links to CSS and JS files, not new pages.""" """Only follow offsite links to JS files, not new pages."""
res_url_data = urlparse_cached(response) res_url_data = urlparse_cached(response)
req_url_data = urlparse_cached(request) req_url_data = urlparse_cached(request)


if is_css_or_js(request): if has_extension(request, 'js'):
return True return True


# Otherwise, ensure that the domains share the same root origin # Otherwise, ensure that the domains share the same root origin
Expand All @@ -45,12 +42,12 @@ def should_follow(self, response, request):




class DepthMiddleware(depth.DepthMiddleware): class DepthMiddleware(depth.DepthMiddleware):
"""A depth middleware that exempts CSS and JS files.""" """A depth middleware that exempts JS files."""
def process_spider_output(self, response, result, spider): def process_spider_output(self, response, result, spider):
"""Ignore depth restrictions for CSS/JS links.""" """Ignore depth restrictions for JS links."""
check_depth = [] check_depth = []
for req in result or []: for req in result or []:
if isinstance(req, Request) and is_css_or_js(req): if isinstance(req, Request) and has_extension(req, 'js'):
yield req yield req
else: else:
check_depth.append(req) check_depth.append(req)
Expand Down

0 comments on commit fbc2a67

Please sign in to comment.