merging new crawl work
Kevin Kubasik committed Mar 30, 2009
2 parents ad6a5e9 + c2efc74 commit d8d6fa1
Showing 3 changed files with 220 additions and 140 deletions.
200 changes: 200 additions & 0 deletions test_utils/crawler.py
@@ -0,0 +1,200 @@
from test_utils import signals as test_signals
from django.test.client import Client
from BeautifulSoup import BeautifulSoup
import re, cgi, urlparse, time

def _parse_urls(url, resp):
    parsed = urlparse.urlparse(url)
    soup = BeautifulSoup(resp.content)
    returned_urls = []
    hrefs = [a['href'] for a in soup.findAll('a') if a.has_key('href')]
    for a in hrefs:
        parsed_href = urlparse.urlparse(a)
        if parsed_href.path.startswith('/') and not parsed_href.scheme:
            returned_urls.append(a)
        elif not parsed_href.scheme:
            #Relative path = previous path + new path
            returned_urls.append(parsed.path + a)
    return returned_urls
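
# Illustrative sketch (not part of the original module): given url='/about/team/'
# and a response whose page contains <a href="/jobs/"> and <a href="bio/">,
# _parse_urls returns ['/jobs/', '/about/team/bio/']. Absolute scheme-less paths
# pass through unchanged, relative paths are joined onto the current path, and
# fully qualified external links (e.g. http://example.com/) are dropped.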

class Crawler(object):
    """
    Crawls every internal URL reachable from base_url, fetching each page
    with the Django test client.
    """
    def __init__(self, base_url, conf_urls={}, verbosity=1):
        self.base_url = base_url
        self.conf_urls = conf_urls
        self.verbosity = verbosity

        #These two are what keep track of what to crawl and what has been.
        self.not_crawled = [('START', self.base_url)]
        self.crawled = {}

        self.c = Client(REMOTE_ADDR='127.0.0.1')

        #Collect every active Plugin subclass; subclassing alone registers it.
        self.plugins = []
        for plug in Plugin.__subclasses__():
            active = getattr(plug, 'active', True)
            if active:
                self.plugins.append(plug())

    def get_url(self, from_url, to_url):
        """
        Fetch a URL with the Django test client and return the response
        along with the list of links parsed from the page.
        """
        parsed = urlparse.urlparse(to_url)
        request_dict = dict(cgi.parse_qsl(parsed.query))
        url_path = parsed.path
        #url_path now contains the path, request_dict contains the GET params

        if self.verbosity > 0:
            print "Getting %s (%s) from (%s)" % (to_url, request_dict, from_url)

        test_signals.pre_request.send(self, url=to_url, request_dict=request_dict)
        resp = self.c.get(url_path, request_dict)
        test_signals.post_request.send(self, url=to_url, response=resp)
        returned_urls = _parse_urls(to_url, resp)
        test_signals.urls_parsed.send(self, fro=to_url, returned_urls=returned_urls)
        return (resp, returned_urls)

    def run(self):
        test_signals.start_run.send(self)
        while len(self.not_crawled) > 0:
            #Take top off not_crawled and evaluate it
            from_url, to_url = self.not_crawled.pop(0)
            #Exception handling is currently disabled so failures surface
            #immediately; re-enable the try/except to skip broken pages:
            #try:
            resp, returned_urls = self.get_url(from_url, to_url)
            #except Exception, e:
            #    print "Exception: %s (%s)" % (e, to_url)
            #    continue
            self.crawled[to_url] = True
            #Find its links that haven't been crawled
            for base_url in returned_urls:
                if base_url not in [to for fro, to in self.not_crawled] and not self.crawled.has_key(base_url):
                    self.not_crawled.append((to_url, base_url))
        test_signals.finish_run.send(self)
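
# Usage sketch (hypothetical, mirroring what crawlurls.py below does): the
# Django test environment must be set up first so the test client works.
#
#     from django.test.utils import setup_test_environment
#     setup_test_environment()
#     crawler = Crawler('/', conf_urls={}, verbosity=1)
#     crawler.run()  # walks every internal link reachable from '/'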

class Plugin(object):
    """
    Base class for plugins to the Crawler.
    Subclass it and define any of pre_request, post_request, start_run,
    finish_run, or urls_parsed; each method you define is connected to the
    signal of the same name. Use finish_run to print an end-of-run report.
    """
    global_data = {}

    def __init__(self):
        if hasattr(self, 'pre_request'):
            test_signals.pre_request.connect(self.pre_request)
        if hasattr(self, 'post_request'):
            test_signals.post_request.connect(self.post_request)
        if hasattr(self, 'start_run'):
            test_signals.start_run.connect(self.start_run)
        if hasattr(self, 'finish_run'):
            test_signals.finish_run.connect(self.finish_run)
        if hasattr(self, 'urls_parsed'):
            test_signals.urls_parsed.connect(self.urls_parsed)

        self.data = self.global_data[self.__class__.__name__] = {}

"""
#These functions enable instance['test'] to save to instance.data
def __setitem__(self, key, val):
self.global_data[self.__class__.__name__][key] = val
def __getitem__(self, key):
return self.global_data[self.__class__.__name__][key]
"""

class Time(Plugin):
    """
    Track the time it takes to run each request.
    """

    def __init__(self):
        super(Time, self).__init__()
        self.timed_urls = self.data['timed_urls'] = {}

    def pre_request(self, sender, **kwargs):
        url = kwargs['url']
        self.timed_urls[url] = time.time()

    def post_request(self, sender, **kwargs):
        cur = time.time()
        url = kwargs['url']
        old_time = self.timed_urls[url]
        total_time = cur - old_time
        self.timed_urls[url] = total_time
        print "Time taken: %s" % self.timed_urls[url]

    def finish_run(self, sender, **kwargs):
        "Print the ten pages that took longest to load"
        alist = sorted(self.timed_urls.iteritems(), key=lambda (k, v): (v, k), reverse=True)
        for url, ttime in alist[:10]:
            print "%s took %f" % (url, ttime)

class URLConf(Plugin):
    """
    Plugin to check URLConf coverage.
    Runs after the spider is done and reports URLConf entries that were
    never hit.
    """

    def finish_run(self, sender, **kwargs):
        for pattern in sender.conf_urls.keys():
            pattern = pattern.replace('^', '').replace('$', '').replace('//', '/')
            curr = re.compile(pattern)
            matched = False
            for url in sender.crawled:
                if curr.search(url):
                    matched = True
            if not matched:
                print "NOT MATCHED: %s" % pattern

class Graph(Plugin):
"Make pretty graphs of your requests"

def __init__(self):
super(Graph, self).__init__()
self.request_graph = self.data['request_graph'] = {}
import pygraphviz
self.graph = pygraphviz.AGraph(directed=True)

def urls_parsed(self, sender, fro, returned_urls, **kwargs):
from_node = self.graph.add_node(str(fro), shape='tripleoctagon')
for url in returned_urls:
if not self.graph.has_node(str(url)):
node = self.graph.add_node(str(url))
self.graph.add_edge(str(fro), str(url))

def finish_run(self, sender, **kwargs):
import ipdb; ipdb.set_trace()
print "Making graph of your URLs, this may take a while"
self.graph.layout(prog='dot')
self.graph.draw('my_urls.svg')

class Sanitize(Plugin):
    "Make sure your responses contain well-formed HTML"

    def post_request(self, sender, **kwargs):
        url = kwargs['url']
        soup = BeautifulSoup(kwargs['response'].content)
        if soup.find(text='<') or soup.find(text='>'):
            print "%s has dirty html" % url

class Pdb(Plugin):
    "Drop into a debugger on failed responses"
    active = False

    def post_request(self, sender, **kwargs):
        url = kwargs['url']
        resp = kwargs['response']
        if hasattr(resp, 'status_code'):
            if resp.status_code not in (200, 301, 302):
                print "FAIL: %s, Status Code: %s" % (url, resp.status_code)
                try:
                    import ipdb; ipdb.set_trace()
                except ImportError:
                    import pdb; pdb.set_trace()
153 changes: 13 additions & 140 deletions test_utils/management/commands/crawlurls.py
@@ -1,18 +1,9 @@
from django.conf import settings
from django.core.management.base import BaseCommand
try:
    # 2008-05-30 admindocs found in newforms-admin branch
    from django.contrib.admindocs.views import extract_views_from_urlpatterns, simplify_regex
except ImportError:
    # fall back to trunk, pre-NFA merge
    from django.contrib.admin.views.doc import extract_views_from_urlpatterns, simplify_regex

from django.test.client import Client
from django.test.utils import setup_test_environment, teardown_test_environment
from BeautifulSoup import BeautifulSoup
import re, cgi, urlparse
from django.contrib.admindocs.views import extract_views_from_urlpatterns

from optparse import make_option
from django.test.utils import setup_test_environment
from test_utils.crawler import Crawler

class Command(BaseCommand):
    option_list = BaseCommand.option_list + (
@@ -32,26 +23,19 @@ class Command(BaseCommand):
            help='TODO: Pass -e NUM to specify how many times each URLConf entry should be hit.'),
    )

    #args = 'app'
    help = "Displays all of the url matching routes for the project."

    requires_model_validation = True

    args = "[relative start url]"

    def handle(self, *args, **options):

        USE_PDB = options.get('pdb', False)
        MAKE_FIXTURES = options.get('fixtures', False)
        CHECK_HTML = options.get('html', False)
        CHECK_TIME = options.get('time', False)
        STORE_RESPONSE = options.get('response', False)
        VERBOSITY = int(options.get('verbosity', 1))
        #EACH_URL = options.get('each', 100000)

        verbosity = int(options.get('verbosity', 1))

        if settings.ADMIN_FOR:
            settings_modules = [__import__(m, {}, {}, ['']) for m in settings.ADMIN_FOR]
        else:
            settings_modules = [settings]

        conf_urls = {}
        for settings_mod in settings_modules:
            try:
@@ -64,119 +48,8 @@ def handle(self, *args, **options):
                #Get the function name and add it to the hash of URLConf urls
                func_name = hasattr(func, '__name__') and func.__name__ or repr(func)
                conf_urls[regex] = [func.__module__, func_name]

        def dumb_get_url(c, from_url, url, request_dic={}):
            "Takes a url, and returns it with a list of links"
            parsed = urlparse.urlparse(url)
            returned_urls = []
            if VERBOSITY > 1:
                print "Getting %s (%s) from (%s)" % (url, request_dic, from_url)
            time_to_run = ''
            if CHECK_TIME:
                resp, time_to_run = time_function(lambda: c.get(url, request_dic))
            else:
                resp = c.get(url, request_dic)

            if resp.status_code in [301, 302]:
                hrefs = [resp['location'].replace('http://testserver', '')]
            else:
                soup = BeautifulSoup(resp.content)
                if CHECK_HTML:
                    if soup.find(text='<') or soup.find(text='>'):
                        print "%s has dirty html" % url
                hrefs = [a['href'] for a in soup.findAll('a') if a.has_key('href')]
            #Now we have all of our URLs to test

            for a in hrefs:
                parsed_href = urlparse.urlparse(a)
                if parsed_href.path.startswith('/') and not parsed_href.scheme:
                    returned_urls.append(a)
                elif not parsed_href.scheme:
                    #Relative path = previous path + new path
                    returned_urls.append(parsed.path + a)
            return (url, resp, time_to_run, returned_urls)

        def run(initial_path):
            setup_test_environment()
            c = Client(REMOTE_ADDR='127.0.0.1')
            not_crawled = [('CLI', initial_path)]
            already_crawled = {}

            while len(not_crawled) > 0:
                #Take top off not_crawled and evaluate it
                from_url, url_target = not_crawled.pop(0)
                orig_url = url_target
                parsed = urlparse.urlparse(url_target)
                request_dic = dict(cgi.parse_qsl(parsed.query))
                url_target = parsed.path
                #url_target now contains the path, request_dic contains get params

                try:
                    url, resp, time_to_run, returned_urls = dumb_get_url(c, from_url, url_target, request_dic)
                except Exception, e:
                    print "Exception: %s (%s)" % (e, url_target)
                    time_to_run = 0
                    resp = ''
                    returned_urls = []
                    url = 'ERR'
                if STORE_RESPONSE:
                    already_crawled[orig_url] = (resp, time_to_run)
                else:
                    already_crawled[orig_url] = time_to_run
                #Get the info on the page
                if hasattr(resp, 'status_code'):
                    if not resp.status_code in (200, 302, 301):
                        print "FAIL: %s, Status Code: %s" % (url, resp.status_code)
                        if USE_PDB:
                            import pdb
                            pdb.set_trace()
                #Find its links
                for base_url in returned_urls:
                    if base_url not in [base for orig, base in not_crawled] and not already_crawled.has_key(base_url):
                        not_crawled.append((orig_url, base_url))

            return already_crawled

        def output_nonmatching(conf_urls, loved_urls):
            "Run after the spider is done to show which URLConf entries never got hit"
            for pattern in conf_urls.keys():
                pattern = pattern.replace('^', '').replace('$', '').replace('//', '/')
                curr = re.compile(pattern)
                matched = False
                for url in loved_urls:
                    if curr.search(url):
                        matched = True
                if not matched:
                    print "NOT MATCHED: %s" % pattern

        def make_fixture(crawled_urls):
            "Serialize crawled objects to keep for later (not implemented yet)"
            return crawled_urls.keys()

        def longest_time(crawled_urls):
            "Print the pages that took longest to load"
            alist = sorted(crawled_urls.iteritems(), key=lambda (k, v): (v, k), reverse=True)
            for url, time in alist[:10]:
                print "%s took %f" % (url, time)

        def time_function(func, prnt=True):
            "Run the function passed in, returning its result and the time elapsed"
            import time
            cur = time.time()
            ret = func()
            time_to_run = (time.time() - cur)
            if prnt and VERBOSITY > 1:
                print "Time Elapsed: %s " % time_to_run
            return (ret, time_to_run)

        #Now we have all of our URLs to test
        crawled_urls = run('/')
        output_nonmatching(conf_urls, crawled_urls.keys())
        if CHECK_TIME:
            longest_time(crawled_urls)
        if MAKE_FIXTURES:
            make_fixture(crawled_urls)

        c = Crawler('/', conf_urls=conf_urls, verbosity=verbosity)
        c.run()
7 changes: 7 additions & 0 deletions test_utils/signals.py
@@ -0,0 +1,7 @@
import django.dispatch

pre_request = django.dispatch.Signal(providing_args=['url', 'request_dict'])
post_request = django.dispatch.Signal(providing_args=['url', 'response'])
urls_parsed = django.dispatch.Signal(providing_args=['fro', 'returned_urls'])
start_run = django.dispatch.Signal()
finish_run = django.dispatch.Signal()
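
# These signals are sent by test_utils.crawler.Crawler, and Plugin subclasses
# connect to them automatically. A standalone listener is a short sketch
# (the on_finish name is hypothetical):
#
#     from test_utils import signals
#
#     def on_finish(sender, **kwargs):
#         print "Crawl finished; %d URLs hit" % len(sender.crawled)
#
#     signals.finish_run.connect(on_finish)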
