
Merge pull request #5 from chrisv2/improve_auth

improve authentication
commit 716f75a5a23b1befa28bbf2da41df4335eb0bf5c (2 parents: 28f5c8b + eb69458), committed by @ericholscher on Jun 29, 2015
Showing with 25 additions and 11 deletions.
  1. +24 −10 crawler/base.py
  2. +1 −1 crawler/management/commands/crawl.py
crawler/base.py
@@ -11,6 +11,7 @@
 from crawler import signals as test_signals
 from crawler.plugins.base import Plugin
+from django.core.management.base import CommandError

 #Used for less useful debug output.
 SUPER_DEBUG = 5
@@ -58,36 +59,45 @@ def __init__(self, base_url, conf_urls={}, verbosity=1, output_dir=None, ascend=
         self.conf_urls = conf_urls
         self.verbosity = verbosity
         self.ascend = ascend
-
-        auth = kwargs.get('auth')
+        self.auth = kwargs.get('auth')
         if output_dir:
             assert os.path.isdir(output_dir)
             self.output_dir = os.path.realpath(output_dir)
             LOG.info("Output will be saved to %s" % self.output_dir)
         else:
             self.output_dir = None
-
+
         #These two are what keep track of what to crawl and what has been.
         self.not_crawled = [(0, 'START', self.base_url)]
         self.crawled = {}

         self.c = Client(REMOTE_ADDR='127.0.0.1')
-        if auth:
-            printable_auth = ', '.join(
-                '%s: %s' % (key, cleanse_setting(key.upper(), value))
-                for key, value in auth.items())
-            LOG.info('Log in with %s' % printable_auth)
-            self.c.login(**auth)
-
+        # Log in, and remember which user was logged in
+        if self.auth:
+            self._login(self.auth)
+            self.user = self.c.session['_auth_user_id']
+
         self.plugins = []
         for plug in Plugin.__subclasses__():
            active = getattr(plug, 'active', True)
            if active:
                 #TODO: Check if plugin supports writing CSV (or to a file in general?)
                 self.plugins.append(plug())

+    def _login(self, auth):
+        if not auth:
+            return
+        printable_auth = ', '.join(
+            '%s: %s' % (key, cleanse_setting(key.upper(), value))
+            for key, value in auth.items())
+        LOG.info('Trying to log in with %s' % printable_auth)
+        if self.c.login(**auth):
+            LOG.info('Logged in successfully')
+        else:
+            raise CommandError("Login failed, check credentials")
+
     def _parse_urls(self, url, resp):
         parsed = urlparse.urlparse(url)
@@ -134,6 +144,10 @@ def get_url(self, from_url, to_url):

         test_signals.pre_request.send(self, url=to_url, request_dict=request_dict)

+        # Check that we are still logged in as the correct user; if not, log in again
+        if '_auth_user_id' not in self.c.session or self.c.session['_auth_user_id'] != self.user:
+            self._login(self.auth)
+
         resp = self.c.get(url_path, request_dict, follow=False)

         test_signals.post_request.send(self, url=to_url, response=resp)
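
Two details of the new code are worth spelling out. After a successful login(), Django's test Client stores the logged-in user's id under the '_auth_user_id' session key, which is what both the self.user bookkeeping and the per-request re-login check rely on; and the pre-existing cleanse_setting helper (the one Django's debug views use) masks sensitive values such as the password before they reach the log. Below is a standalone sketch of the login / re-login pattern, outside the crawler class; the 'foo'/'bar' credentials are placeholders, not part of this commit.

# Minimal standalone sketch of the pattern used above; placeholder credentials.
from django.test import Client

auth = {'username': 'foo', 'password': 'bar'}
c = Client(REMOTE_ADDR='127.0.0.1')

if c.login(**auth):                      # login() returns True on success
    user = c.session['_auth_user_id']    # remember which user is logged in

# Later, before each crawled request: a crawled view may have ended the
# session (e.g. a logout view), so verify the session and log in again.
if '_auth_user_id' not in c.session or c.session['_auth_user_id'] != user:
    c.login(**auth)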
crawler/management/commands/crawl.py
@@ -34,7 +34,7 @@ class Command(BaseCommand):
         make_option('--no-parent', action='store_true', dest="no_parent", default=False,
                     help='Do not crawl URLs which do not start with your base URL'),
         make_option('-a', "--auth", action='store', dest='auth', default=None,
-                    help='Authenticate (login:user,password:secret) before crawl')
+                    help='Authenticate before crawl. Example: --auth username:foo,password:bar')
     )

     help = "Displays all of the url matching routes for the project."
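
For context, the --auth value is a comma-separated list of key:value pairs that presumably ends up as the keyword arguments passed to Client.login() (the diff above calls self.c.login(**auth)). The parsing itself happens elsewhere in the command and is not part of this diff; a hypothetical sketch of what it would look like:

# Hypothetical parser for the --auth option value; the real parsing code
# is elsewhere in the command and is not shown in this diff.
def parse_auth(value):
    """Turn 'username:foo,password:bar' into a credentials dict."""
    return dict(pair.split(':', 1) for pair in value.split(','))

assert parse_auth('username:foo,password:bar') == {'username': 'foo', 'password': 'bar'}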
