Skip to content

Commit

Permalink
Add the ability to retrieve all urls and follow redirects, fix #1
Browse files Browse the repository at this point in the history
  • Loading branch information
davidbgk committed Jun 5, 2015
1 parent 94faa49 commit 83dcdf3
Show file tree
Hide file tree
Showing 7 changed files with 82 additions and 13 deletions.
6 changes: 5 additions & 1 deletion croquemort/crawler.py
Expand Up @@ -25,15 +25,19 @@ def check_url(self, url_group_frequency):
url, group, frequency = url_group_frequency
log(('Checking {url} for group {group} and frequency "{frequency}"'
.format(url=url, group=group, frequency=frequency)))
if not url.startswith('http'):
logbook.error('Error with {url}: not a URL'.format(url=url))
return
self.storage.store_url(url)
if group:
self.storage.store_group(url, group)
if frequency:
self.storage.store_frequency(url, group, frequency)
try:
response = session.head(url)
response = session.head(url, allow_redirects=True)
except requests.exceptions.ConnectionError:
response = FakeResponse(status_code=503, headers={})
except Exception as e:
logbook.error('Error with {url}: {e}'.format(url=url, e=e))
return
self.storage.store_metadata(url, response)
42 changes: 33 additions & 9 deletions croquemort/http.py
Expand Up @@ -8,7 +8,8 @@
from .logger import LoggingDependency
from .storages import RedisStorage
from .tools import (
apply_filters, extract_filters, generate_hash, required_parameters
apply_filters, data_from_request, extract_filters, generate_hash,
required_parameters
)

log = logbook.debug
Expand Down Expand Up @@ -44,13 +45,12 @@ def retrieve_group(self, data):
return self.retrieve_group_from_hash(data, generate_hash(group))

@http('GET', '/group/<group_hash>')
def retrieve_group_from_hash(self, request_or_data, group_hash):
def retrieve_group_from_hash(self, request, group_hash):
log('Retrieving group hash {hash}'.format(hash=group_hash))
if isinstance(request_or_data, dict):
data = request_or_data
else:
request_data = request_or_data.get_data().decode('utf-8')
data = json.loads(request_data or '{}')
try:
data = data_from_request(request)
except ValueError as error:
return 400, 'Incorrect parameters: {error}'.format(error=error)
group_infos = self.storage.get_group(group_hash)
if not group_infos:
return 404, ''
Expand All @@ -62,8 +62,30 @@ def retrieve_group_from_hash(self, request_or_data, group_hash):
results = apply_filters(url_infos, filters, excludes)
if results:
infos[url_hash] = results
log('Returning {num} results'.format(num=len(infos)))
return json.dumps(infos, indent=2)

@http('GET', '/')
def retrieve_urls(self, request):
    """Return the stored information of every known URL as JSON.

    The request data is decoded with `data_from_request`; the filters
    and excludes obtained from it through `extract_filters` are applied
    to each URL's stored information. URLs whose filtered result is
    falsy are left out of the response.

    Returns a `(400, message)` tuple when the request data cannot be
    decoded, `(404, '')` when the storage reports a falsy collection of
    URLs, and otherwise a JSON string mapping each URL hash to its
    filtered information.
    """
    log('Retrieving urls')
    try:
        data = data_from_request(request)
    except ValueError as error:
        message = 'Incorrect parameters: {error}'.format(error=error)
        return 400, message
    stored_urls = self.storage.get_all_urls()
    # NOTE(review): a generator object is always truthy, so this guard
    # only fires if the storage hands back a real (empty) container --
    # confirm against the storage implementation.
    if not stored_urls:
        return 404, ''
    filters, excludes = extract_filters(data)
    matches = {}
    # Only the hash of each pair is needed: the information is looked
    # up again through the storage right below.
    for url_hash, _ in stored_urls:
        filtered = apply_filters(
            self.storage.get_url(url_hash), filters, excludes)
        if filtered:
            matches[url_hash] = filtered
    log('Returning {num} results'.format(num=len(matches)))
    return json.dumps(matches, indent=2)


@http('POST', '/check/one')
@required_parameters('url')
def check_one(self, data):
Expand All @@ -80,8 +102,10 @@ def check_many(self, data):
group = data.get('group')
group_hash = generate_hash(group)
frequency = data.get('frequency', None)
log(('Checking "{group}" ({hash}) with frequency "{frequency}"'
.format(group=group, hash=group_hash, frequency=frequency)))
log(('Checking {num} URLs in group "{group}" ({hash}) '
'with frequency "{frequency}"'.format(
num=len(urls), group=group, hash=group_hash,
frequency=frequency)))
for url in urls:
self.fetch(url, group, frequency)
return json.dumps({'group-hash': group_hash}, indent=2)
Expand Down
6 changes: 6 additions & 0 deletions croquemort/storages.py
Expand Up @@ -24,6 +24,10 @@ def __init__(self):
def get_dependency(self, worker_ctx):
return self

def get_all_urls(self):
    """Lazily yield `(url_hash, url_infos)` for each hash in 'urls'.

    The hashes are read from the whole 'urls' list of the database and
    the information attached to each one is fetched with `get_url`.
    """
    stored_hashes = self.database.lrange('urls', 0, -1)
    for stored_hash in stored_hashes:
        url_infos = self.get_url(stored_hash)
        yield stored_hash, url_infos

def get_url(self, url_hash):
    """Return all fields of the hash stored under `url_hash`."""
    key = str_to_bytes(url_hash)
    return self.database.hgetall(key)

Expand All @@ -33,6 +37,8 @@ def get_group(self, group_hash):
def store_url(self, url):
    """Store `url` under its hash and register that hash in 'urls'.

    The URL is written to the 'url' field of the hash keyed by
    `generate_hash(url)`; the hash is then appended to the 'urls' list
    unless it is already listed there.
    """
    url_hash = generate_hash(url)
    hash_key = str_to_bytes(url_hash)
    self.database.hset(url_hash, 'url', str_to_bytes(url))
    known_hashes = self.database.lrange('urls', 0, -1)
    # The list is fed with the bytes form of the hash. When the client
    # returns raw bytes (presumably the case here -- confirm against the
    # client configuration), testing only the text form never matches
    # and every call appends a duplicate, so both forms are checked.
    if url_hash not in known_hashes and hash_key not in known_hashes:
        self.database.rpush('urls', hash_key)

def store_group(self, url, group):
url_hash = generate_hash(url)
Expand Down
11 changes: 9 additions & 2 deletions croquemort/tools.py
Expand Up @@ -7,6 +7,14 @@
log = logbook.debug


def data_from_request(request):
    """Return a dict of data from a JSON request, idempotent.

    :param request: either an already decoded dict, which is returned
        unchanged (this is what makes the call idempotent), or an object
        whose `get_data()` returns the UTF-8 encoded JSON body.
    :returns: the decoded JSON object; an empty body yields `{}`.
    :raises ValueError: if the body is not valid UTF-8 or JSON, or if
        the decoded document is not a JSON object.
    """
    if isinstance(request, dict):
        return request
    raw = request.get_data().decode('utf-8')
    data = json.loads(raw or '{}')
    if not isinstance(data, dict):
        # Callers use the result as a mapping and already turn a
        # ValueError into a 400 response, so arrays, scalars and null
        # are rejected here instead of failing further down.
        raise ValueError(
            'expected a JSON object, got {kind}'.format(
                kind=type(data).__name__))
    return data


def required_parameters(*parameters):
"""A decorator for views with required parameters.
Expand All @@ -20,9 +28,8 @@ def required_parameters(*parameters):
def wrapper(wrapped, instance, args, kwargs):
args = list(args)
request = args[0]
raw = request.get_data().decode('utf-8')
try:
data = json.loads(raw or '{}')
data = data_from_request(request)
except ValueError as error:
return 400, 'Incorrect parameters: {error}'.format(error=error)
for parameter in parameters:
Expand Down
11 changes: 11 additions & 0 deletions tests/test_integrations.py
Expand Up @@ -30,6 +30,17 @@ def make_runner(*service_classes):
pass


def test_retrieve_urls(runner_factory, web_session):
    """GET / returns the stubbed storage entries keyed by URL hash."""
    runner = runner_factory(HttpService)
    container = get_container(runner, HttpService)
    fake_storage = replace_dependencies(container, 'storage')
    # Stub the two storage calls made by the view: the listing of
    # (hash, url) pairs and the per-hash lookup.
    fake_storage.get_all_urls = lambda: (('hash', 'url'),)
    fake_storage.get_url = lambda url_hash: {'url': url_hash}
    runner.start()
    response = web_session.get('/')
    assert response.json()['hash'] == {'url': 'hash'}


def test_retrieve_url(runner_factory, web_session):
runner = runner_factory(HttpService)
http_container = get_container(runner, HttpService)
Expand Down
13 changes: 12 additions & 1 deletion tests/test_tools.py
@@ -1,4 +1,15 @@
from croquemort.tools import apply_filters, extract_filters, generate_hash
from unittest.mock import MagicMock

from croquemort.tools import (
apply_filters, data_from_request, extract_filters, generate_hash
)


def test_data_from_request():
    """Dicts pass through unchanged and JSON request bodies are decoded."""
    expected = {'foo': 'bar'}
    assert data_from_request({'foo': 'bar'}) == expected
    # Stand-in for a request object exposing only `get_data()`.
    request = MagicMock()
    request.get_data.return_value = b'{"foo": "bar"}'
    assert data_from_request(request) == expected


def test_generate_hash():
Expand Down
6 changes: 6 additions & 0 deletions tests/test_urls.py
Expand Up @@ -12,6 +12,12 @@ def web_session(container_factory, web_config, web_session):
return web_session


def test_get_urls(web_session):
    """GET / answers 200 with an empty JSON object.

    Presumably the storage behind the `web_session` fixture holds no
    URL at this point -- confirm against the fixture setup.
    """
    response = web_session.get('/')
    assert response.text == '{}'
    assert response.status_code == 200


def test_get_url(web_session):
rv = web_session.get('/url', data=json.dumps({
'url': 'http://example.org'
Expand Down

0 comments on commit 83dcdf3

Please sign in to comment.