Skip to content

Commit

Permalink
Add a configurable delay between retries, re-introduce legacy samplin…
Browse files Browse the repository at this point in the history
…g-based soft404 detection as a fallback
  • Loading branch information
lphuberdeau committed May 9, 2018
1 parent c7fda26 commit 965659f
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 17 deletions.
4 changes: 3 additions & 1 deletion hammertime/engine/retry.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,14 @@

class RetryEngine(Engine):

def __init__(self, engine, *, loop, stats, retry_count=0):
def __init__(self, engine, *, loop, stats, retry_count=0, retry_delay=1.0):
self.request_engine = engine
self.retry_count = retry_count
self.stats = stats
self.general_limiter = asyncio.Semaphore(50, loop=loop)
self.priority_limiter = asyncio.Semaphore(10, loop=loop)
self.default_heuristics = None
self.retry_delay = retry_delay

async def perform(self, entry, heuristics):
if self.default_heuristics is None:
Expand All @@ -55,6 +56,7 @@ async def _perform(self, limiter, entry, heuristics):
entry.result.attempt += 1
self.stats.retries += 1
entry.response = None
await asyncio.sleep(self.retry_delay)

async def close(self):
if self.request_engine is not None:
Expand Down
38 changes: 28 additions & 10 deletions hammertime/rules/status.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,11 @@
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.


import asyncio
from urllib.parse import urljoin, urlparse
import os
from collections import defaultdict
from difflib import SequenceMatcher
import re
import random
import string
Expand All @@ -44,13 +44,14 @@ async def after_headers(self, entry):

class DetectSoft404:

def __init__(self, distance_threshold=5, match_filter=DEFAULT_FILTER, token_size=4):
def __init__(self, distance_threshold=5, match_filter=DEFAULT_FILTER, token_size=4, sample_length=5120):
self.engine = None
self.performed = defaultdict(dict)
self.soft_404_responses = defaultdict(dict)
self.distance_threshold = distance_threshold
self.match_filter = match_filter
self.token_size = token_size
self.sample_length = sample_length

def set_engine(self, engine):
self.engine = engine
Expand Down Expand Up @@ -100,13 +101,15 @@ async def _collect_sample(self, url, url_pattern):
request = Entry.create(url)
result = await self.engine.perform_high_priority(request, self.child_heuristics)
try:
simhash = Simhash(result.response.content, filter=self.match_filter, token_size=self.token_size).value
simhash = self._simhash(result.response)
return {"code": result.response.code,
"content_simhash": simhash,
"raw_content_hash": self._hash(result.response)}
"raw_content_hash": self._hash(result.response),
"content_sample": self._sample(result.response)}
except UnicodeDecodeError: # Response content is not text, store the hash of the raw data:
return {"code": result.response.code,
"raw_content_hash": self._hash(result.response)}
"raw_content_hash": self._hash(result.response),
"content_sample": self._sample(result.response)}

def _match(self, response, soft_404_response):
if soft_404_response["code"] == response.code:
Expand All @@ -116,17 +119,32 @@ def _match(self, response, soft_404_response):

if "content_simhash" in soft_404_response:
try:
resp_hash = Simhash(response.content, filter=self.match_filter, token_size=self.token_size)
resp_hash = Simhash(self._simhash(response))
distance = resp_hash.distance(Simhash(soft_404_response["content_simhash"]))
return distance < self.distance_threshold
if distance < self.distance_threshold:
return True
except UnicodeDecodeError: # response content is not text, cannot match text.
return False
else:
return False
pass

if self.sample_length and "content_sample" in soft_404_response:
sample = self._sample(response)
matcher = SequenceMatcher(isjunk=None, a=soft_404_response["content_sample"], b=sample, autojunk=False)

# The content closely matches a generated 404 sample (similarity ratio above 0.8), so treat it as a soft 404.
if matcher.ratio() > 0.8:
return True

return False

def _hash(self, response):
return hashlib.md5(response.raw).digest()

def _simhash(self, response):
    """Return the Simhash value computed from the response content.

    The hash is built with this rule's configured ``match_filter`` and
    ``token_size`` so that every fingerprint is computed the same way.
    """
    fingerprint = Simhash(
        response.content,
        filter=self.match_filter,
        token_size=self.token_size,
    )
    return fingerprint.value

def _sample(self, response):
return response.raw[0:self.sample_length]

def _extract_pattern_from_url(self, url):
"""Return the path part of the URL with the last element replaced with its pattern in a regex-like format:
\l -> lowercase letters, same as [a-z]+
Expand Down
12 changes: 6 additions & 6 deletions tests/rules/reject_status_code_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

from unittest import TestCase
from unittest.mock import MagicMock, patch
from unittest.mock import MagicMock, patch, ANY
from urllib.parse import urljoin, urlparse
import re
import uuid
Expand Down Expand Up @@ -129,10 +129,10 @@ async def test_add_alternate_url_response_to_knowledge_base(self):
simhash = Simhash(response.content).value
raw = self.rule._hash(response)
self.assertEqual(self.kb.soft_404_responses["http://example.com/"], {
"/\l": {"code": 200, "content_simhash": simhash, "raw_content_hash": raw},
"/\d/": {"code": 200, "content_simhash": simhash, "raw_content_hash": raw},
"/.\l": {"code": 200, "content_simhash": simhash, "raw_content_hash": raw},
"/123/\l.js": {"code": 200, "content_simhash": simhash, "raw_content_hash": raw}})
"/\l": {"code": 200, "content_simhash": simhash, "raw_content_hash": raw, "content_sample": ANY},
"/\d/": {"code": 200, "content_simhash": simhash, "raw_content_hash": raw, "content_sample": ANY},
"/.\l": {"code": 200, "content_simhash": simhash, "raw_content_hash": raw, "content_sample": ANY},
"/123/\l.js": {"code": 200, "content_simhash": simhash, "raw_content_hash": raw, "content_sample": ANY}})

@async_test()
async def test_add_None_to_knowledge_base_if_request_failed(self):
Expand All @@ -152,7 +152,7 @@ async def test_add_hash_of_raw_content_if_response_content_of_sample_is_not_text
await self.rule.on_request_successful(self.create_entry("http://example.com/test", response_content="response"))

self.assertEqual(self.kb.soft_404_responses["http://example.com/"], {
"/\l": {"code": 200, "raw_content_hash": hashlib.md5(bytes).digest()}})
"/\l": {"code": 200, "raw_content_hash": hashlib.md5(bytes).digest(), "content_sample": ANY}})

@async_test()
async def test_mark_request_has_soft404_if_pattern_and_response_match_request_in_knowledge_base(self):
Expand Down

0 comments on commit 965659f

Please sign in to comment.