merging new crawl work
Kevin Kubasik committed Mar 30, 2009
2 parents ad6a5e9 + c2efc74 commit d8d6fa1
Showing 3 changed files with 220 additions and 140 deletions.
200 changes: 200 additions & 0 deletions test_utils/crawler.py
@@ -0,0 +1,200 @@
from test_utils import signals as test_signals
from django.test.client import Client
from BeautifulSoup import BeautifulSoup
import re, cgi, urlparse, time

def _parse_urls(url, resp):
    parsed = urlparse.urlparse(url)
    soup = BeautifulSoup(resp.content)
    returned_urls = []
    hrefs = [a['href'] for a in soup.findAll('a') if a.has_key('href')]
    for a in hrefs:
        parsed_href = urlparse.urlparse(a)
        if parsed_href.path.startswith('/') and not parsed_href.scheme:
            returned_urls.append(a)
        elif not parsed_href.scheme:
            #Relative path = previous path + new path
            returned_urls.append(parsed.path + a)
    return returned_urls
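
# Illustrative sketch (not part of the original module): given url='/about/team/'
# and a response whose page contains <a href="/jobs/"> and <a href="bio/">,
# _parse_urls returns ['/jobs/', '/about/team/bio/']. Absolute scheme-less paths
# pass through unchanged, relative paths are joined onto the current path, and
# fully qualified external links (e.g. http://example.com/) are dropped.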

class Crawler(object):
    """
    Crawls every internal URL reachable from base_url, fetching each page
    with the Django test client.
    """
    def __init__(self, base_url, conf_urls={}, verbosity=1):
        self.base_url = base_url
        self.conf_urls = conf_urls
        self.verbosity = verbosity

        #These two are what keep track of what to crawl and what has been.
        self.not_crawled = [('START', self.base_url)]
        self.crawled = {}

        self.c = Client(REMOTE_ADDR='127.0.0.1')

        #Collect every active Plugin subclass; subclassing alone registers it.
        self.plugins = []
        for plug in Plugin.__subclasses__():
            active = getattr(plug, 'active', True)
            if active:
                self.plugins.append(plug())

    def get_url(self, from_url, to_url):
        """
        Fetch a URL with the Django test client and return the response
        along with the list of links parsed from the page.
        """
        parsed = urlparse.urlparse(to_url)
        request_dict = dict(cgi.parse_qsl(parsed.query))
        url_path = parsed.path
        #url_path now contains the path, request_dict contains the GET params

        if self.verbosity > 0:
            print "Getting %s (%s) from (%s)" % (to_url, request_dict, from_url)

        test_signals.pre_request.send(self, url=to_url, request_dict=request_dict)
        resp = self.c.get(url_path, request_dict)
        test_signals.post_request.send(self, url=to_url, response=resp)
        returned_urls = _parse_urls(to_url, resp)
        test_signals.urls_parsed.send(self, fro=to_url, returned_urls=returned_urls)
        return (resp, returned_urls)

    def run(self):
        test_signals.start_run.send(self)
        while len(self.not_crawled) > 0:
            #Take top off not_crawled and evaluate it
            from_url, to_url = self.not_crawled.pop(0)
            #Exception handling is currently disabled so failures surface
            #immediately; re-enable the try/except to skip broken pages:
            #try:
            resp, returned_urls = self.get_url(from_url, to_url)
            #except Exception, e:
            #    print "Exception: %s (%s)" % (e, to_url)
            #    continue
            self.crawled[to_url] = True
            #Find its links that haven't been crawled
            for base_url in returned_urls:
                if base_url not in [to for fro, to in self.not_crawled] and not self.crawled.has_key(base_url):
                    self.not_crawled.append((to_url, base_url))
        test_signals.finish_run.send(self)
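
# Usage sketch (hypothetical, mirroring what crawlurls.py below does): the
# Django test environment must be set up first so the test client works.
#
#     from django.test.utils import setup_test_environment
#     setup_test_environment()
#     crawler = Crawler('/', conf_urls={}, verbosity=1)
#     crawler.run()  # walks every internal link reachable from '/'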

class Plugin(object):
    """
    Base class for plugins to the Crawler.
    Subclass it and define any of pre_request, post_request, start_run,
    finish_run, or urls_parsed; each method you define is connected to the
    signal of the same name. Use finish_run to print an end-of-run report.
    """
    global_data = {}

    def __init__(self):
        if hasattr(self, 'pre_request'):
            test_signals.pre_request.connect(self.pre_request)
        if hasattr(self, 'post_request'):
            test_signals.post_request.connect(self.post_request)
        if hasattr(self, 'start_run'):
            test_signals.start_run.connect(self.start_run)
        if hasattr(self, 'finish_run'):
            test_signals.finish_run.connect(self.finish_run)
        if hasattr(self, 'urls_parsed'):
            test_signals.urls_parsed.connect(self.urls_parsed)

        self.data = self.global_data[self.__class__.__name__] = {}

"""
#These functions enable instance['test'] to save to instance.data
def __setitem__(self, key, val):
self.global_data[self.__class__.__name__][key] = val
def __getitem__(self, key):
return self.global_data[self.__class__.__name__][key]
"""

class Time(Plugin):
    """
    Track the time it takes to run each request.
    """

    def __init__(self):
        super(Time, self).__init__()
        self.timed_urls = self.data['timed_urls'] = {}

    def pre_request(self, sender, **kwargs):
        url = kwargs['url']
        self.timed_urls[url] = time.time()

    def post_request(self, sender, **kwargs):
        cur = time.time()
        url = kwargs['url']
        old_time = self.timed_urls[url]
        total_time = cur - old_time
        self.timed_urls[url] = total_time
        print "Time taken: %s" % self.timed_urls[url]

    def finish_run(self, sender, **kwargs):
        "Print the ten pages that took longest to load"
        alist = sorted(self.timed_urls.iteritems(), key=lambda (k, v): (v, k), reverse=True)
        for url, ttime in alist[:10]:
            print "%s took %f" % (url, ttime)

class URLConf(Plugin):
    """
    Plugin to check URLConf coverage.
    Runs after the spider is done and reports URLConf entries that were
    never hit.
    """

    def finish_run(self, sender, **kwargs):
        for pattern in sender.conf_urls.keys():
            pattern = pattern.replace('^', '').replace('$', '').replace('//', '/')
            curr = re.compile(pattern)
            matched = False
            for url in sender.crawled:
                if curr.search(url):
                    matched = True
            if not matched:
                print "NOT MATCHED: %s" % pattern

class Graph(Plugin):
"Make pretty graphs of your requests"

def __init__(self):
super(Graph, self).__init__()
self.request_graph = self.data['request_graph'] = {}
import pygraphviz
self.graph = pygraphviz.AGraph(directed=True)

def urls_parsed(self, sender, fro, returned_urls, **kwargs):
from_node = self.graph.add_node(str(fro), shape='tripleoctagon')
for url in returned_urls:
if not self.graph.has_node(str(url)):
node = self.graph.add_node(str(url))
self.graph.add_edge(str(fro), str(url))

def finish_run(self, sender, **kwargs):
import ipdb; ipdb.set_trace()
print "Making graph of your URLs, this may take a while"
self.graph.layout(prog='dot')
self.graph.draw('my_urls.svg')

class Sanitize(Plugin):
    "Make sure your responses contain well-formed HTML"

    def post_request(self, sender, **kwargs):
        url = kwargs['url']
        soup = BeautifulSoup(kwargs['response'].content)
        if soup.find(text='<') or soup.find(text='>'):
            print "%s has dirty html" % url

class Pdb(Plugin):
    "Drop into a debugger on failed responses"
    active = False

    def post_request(self, sender, **kwargs):
        url = kwargs['url']
        resp = kwargs['response']
        if hasattr(resp, 'status_code'):
            if resp.status_code not in (200, 301, 302):
                print "FAIL: %s, Status Code: %s" % (url, resp.status_code)
                try:
                    import ipdb; ipdb.set_trace()
                except ImportError:
                    import pdb; pdb.set_trace()
153 changes: 13 additions & 140 deletions test_utils/management/commands/crawlurls.py
@@ -1,18 +1,9 @@
from django.conf import settings
from django.core.management.base import BaseCommand
try:
    # 2008-05-30 admindocs found in newforms-admin branch
    from django.contrib.admindocs.views import extract_views_from_urlpatterns, simplify_regex
except ImportError:
    # fall back to trunk, pre-NFA merge
    from django.contrib.admin.views.doc import extract_views_from_urlpatterns, simplify_regex

from django.test.client import Client
from django.test.utils import setup_test_environment, teardown_test_environment
from BeautifulSoup import BeautifulSoup
import re, cgi, urlparse
from django.contrib.admindocs.views import extract_views_from_urlpatterns

from optparse import make_option
from django.test.utils import setup_test_environment
from test_utils.crawler import Crawler

class Command(BaseCommand):
    option_list = BaseCommand.option_list + (
@@ -32,26 +23,19 @@ class Command(BaseCommand):
            help='TODO: Pass -e NUM to specify how many times each URLConf entry should be hit.'),
    )

    #args = 'app'
    help = "Displays all of the url matching routes for the project."

    requires_model_validation = True

    args = "[relative start url]"

    def handle(self, *args, **options):

        USE_PDB = options.get('pdb', False)
        MAKE_FIXTURES = options.get('fixtures', False)
        CHECK_HTML = options.get('html', False)
        CHECK_TIME = options.get('time', False)
        STORE_RESPONSE = options.get('response', False)
        VERBOSITY = int(options.get('verbosity', 1))
        #EACH_URL = options.get('each', 100000)

        verbosity = int(options.get('verbosity', 1))

        if settings.ADMIN_FOR:
            settings_modules = [__import__(m, {}, {}, ['']) for m in settings.ADMIN_FOR]
        else:
            settings_modules = [settings]

        conf_urls = {}
        for settings_mod in settings_modules:
            try:
@@ -64,119 +48,8 @@ def handle(self, *args, **options):
                #Get the function name and add it to the hash of URLConf urls
                func_name = hasattr(func, '__name__') and func.__name__ or repr(func)
                conf_urls[regex] = [func.__module__, func_name]

        def dumb_get_url(c, from_url, url, request_dic={}):
            "Takes a url, and returns it with a list of links"
            parsed = urlparse.urlparse(url)
            returned_urls = []
            if VERBOSITY > 1:
                print "Getting %s (%s) from (%s)" % (url, request_dic, from_url)
            time_to_run = ''
            if CHECK_TIME:
                resp, time_to_run = time_function(lambda: c.get(url, request_dic))
            else:
                resp = c.get(url, request_dic)

            if resp.status_code in [301, 302]:
                hrefs = [resp['location'].replace('http://testserver', '')]
            else:
                soup = BeautifulSoup(resp.content)
                if CHECK_HTML:
                    if soup.find(text='<') or soup.find(text='>'):
                        print "%s has dirty html" % url
                hrefs = [a['href'] for a in soup.findAll('a') if a.has_key('href')]
            #Now we have all of our URLs to test

            for a in hrefs:
                parsed_href = urlparse.urlparse(a)
                if parsed_href.path.startswith('/') and not parsed_href.scheme:
                    returned_urls.append(a)
                elif not parsed_href.scheme:
                    #Relative path = previous path + new path
                    returned_urls.append(parsed.path + a)
            return (url, resp, time_to_run, returned_urls)

        def run(initial_path):
            setup_test_environment()
            c = Client(REMOTE_ADDR='127.0.0.1')
            not_crawled = [('CLI', initial_path)]
            already_crawled = {}

            while len(not_crawled) > 0:
                #Take top off not_crawled and evaluate it
                from_url, url_target = not_crawled.pop(0)
                orig_url = url_target
                parsed = urlparse.urlparse(url_target)
                request_dic = dict(cgi.parse_qsl(parsed.query))
                url_target = parsed.path
                #url_target now contains the path, request_dic contains get params

                try:
                    url, resp, time_to_run, returned_urls = dumb_get_url(c, from_url, url_target, request_dic)
                except Exception, e:
                    print "Exception: %s (%s)" % (e, url_target)
                    time_to_run = 0
                    resp = ''
                    returned_urls = []
                    url = 'ERR'
                if STORE_RESPONSE:
                    already_crawled[orig_url] = (resp, time_to_run)
                else:
                    already_crawled[orig_url] = time_to_run
                #Get the info on the page
                if hasattr(resp, 'status_code'):
                    if not resp.status_code in (200, 302, 301):
                        print "FAIL: %s, Status Code: %s" % (url, resp.status_code)
                        if USE_PDB:
                            import pdb
                            pdb.set_trace()
                #Find its links
                for base_url in returned_urls:
                    if base_url not in [base for orig, base in not_crawled] and not already_crawled.has_key(base_url):
                        not_crawled.append((orig_url, base_url))

            return already_crawled

        def output_nonmatching(conf_urls, loved_urls):
            "Run after the spider is done to show which URLConf entries never got hit"
            for pattern in conf_urls.keys():
                pattern = pattern.replace('^', '').replace('$', '').replace('//', '/')
                curr = re.compile(pattern)
                matched = False
                for url in loved_urls:
                    if curr.search(url):
                        matched = True
                if not matched:
                    print "NOT MATCHED: %s" % pattern

        def make_fixture(crawled_urls):
            "Serialize crawled objects to keep for later (not implemented yet)"
            return crawled_urls.keys()

        def longest_time(crawled_urls):
            "Print the pages that took longest to load"
            alist = sorted(crawled_urls.iteritems(), key=lambda (k, v): (v, k), reverse=True)
            for url, time in alist[:10]:
                print "%s took %f" % (url, time)

        def time_function(func, prnt=True):
            "Run the function passed in, returning its result and the time elapsed"
            import time
            cur = time.time()
            ret = func()
            time_to_run = (time.time() - cur)
            if prnt and VERBOSITY > 1:
                print "Time Elapsed: %s " % time_to_run
            return (ret, time_to_run)

        #Now we have all of our URLs to test
        crawled_urls = run('/')
        output_nonmatching(conf_urls, crawled_urls.keys())
        if CHECK_TIME:
            longest_time(crawled_urls)
        if MAKE_FIXTURES:
            make_fixture(crawled_urls)

        c = Crawler('/', conf_urls=conf_urls, verbosity=verbosity)
        c.run()
7 changes: 7 additions & 0 deletions test_utils/signals.py
@@ -0,0 +1,7 @@
import django.dispatch

pre_request = django.dispatch.Signal(providing_args=['url', 'request_dict'])
post_request = django.dispatch.Signal(providing_args=['url', 'response'])
urls_parsed = django.dispatch.Signal(providing_args=['fro', 'returned_urls'])
start_run = django.dispatch.Signal()
finish_run = django.dispatch.Signal()
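
# These signals are sent by test_utils.crawler.Crawler, and Plugin subclasses
# connect to them automatically. A standalone listener is a short sketch
# (the on_finish name is hypothetical):
#
#     from test_utils import signals
#
#     def on_finish(sender, **kwargs):
#         print "Crawl finished; %d URLs hit" % len(sender.crawled)
#
#     signals.finish_run.connect(on_finish)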
