Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ different domains.

eg. ::

HAYSTACK\_STATIC\_PAGES = (
HAYSTACK_STATIC_PAGES = (
'static-about_us', # A named url
'static-help', # Another named url
'http://www.example.com/some_page.html', # A fully qualified url
Expand Down Expand Up @@ -62,5 +62,6 @@ There are currently two command line options that can be used with the
that may not necessarily be known for placing in settings.py
-u URLS, --urls=URLS This allows the user to include additional actual urls
that may not necessarily be known for placing in settings.py


-s, --strip Strip tags from the fetched HTML and keep only the text content.
Can also be enabled by setting HAYSTACK_STATIC_PAGES_STRIP_HTML = True
in your ``settings.py``
31 changes: 27 additions & 4 deletions haystack_static_pages/management/commands/crawl_static_pages.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,30 @@
from django.utils import translation
from django.utils.html import escape
from optparse import make_option

from BeautifulSoup import BeautifulSoup

from HTMLParser import HTMLParser
from haystack_static_pages.models import StaticPage

class MLStripper(HTMLParser):
    """HTML parser that discards markup and collects only the text nodes.

    Feed markup with ``feed()`` and read the accumulated text back with
    ``get_data()``.
    """

    def __init__(self):
        # Run the base-class initializer instead of calling reset() directly,
        # so any parser state the base class sets up in __init__ is in place
        # before feed() is used. The explicit base-class call (rather than
        # super()) keeps this usable if HTMLParser is an old-style class.
        HTMLParser.__init__(self)
        # Text chunks in document order, as delivered to handle_data().
        self.fed = []

    def handle_data(self, d):
        """Record one run of character data found between tags."""
        self.fed.append(d)

    def get_data(self):
        """Return all text collected so far, joined into a single string."""
        return ''.join(self.fed)


def strip_tags(html):
    """Return the text content of *html* with the markup removed.

    The markup is run through a fresh ``MLStripper`` and the character data
    it collected is returned as one string.
    """
    stripper = MLStripper()
    stripper.feed(html)
    text = stripper.get_data()
    return text


def list_callback(option, opt, value, parser):
    """Option callback that stores a comma-separated value as a list.

    Splits ``value`` on ``,`` and assigns the resulting list to the attribute
    named by ``option.dest`` on ``parser.values``. ``opt`` is not used.
    """
    items = value.split(',')
    setattr(parser.values, option.dest, items)


class Command(BaseCommand):
option_list = BaseCommand.option_list + (
make_option('-p', '--port', action='store', dest='port', default=None,
Expand All @@ -25,11 +41,13 @@ class Command(BaseCommand):
help='List of named urls to be indexed (in addition to HAYSTACK_STATIC_PAGES)'),
make_option('-u', '--urls', type='string', action='callback', callback=list_callback,
help='List of actual urls to be indexed (in addition to HAYSTACK_STATIC_PAGES)'),
make_option('-s', '--strip', action='store_true', dest='strip_html', default=False,
help='Strip HTML tags prior to saving the page'),
)
help = 'Setup static pages defined in HAYSTACK_STATIC_PAGES for indexing by Haystack'
cmd = 'crawl_static_pages [-p PORT] [-l LANG] [-u LIST OF URLs]'

def handle(self, *args, **options):
cmd = 'crawl_static_pages [-p PORT] [-l LANG] [-u LIST OF URLs]'
if args:
raise CommandError('Usage is: %s' % cmd)

Expand Down Expand Up @@ -101,7 +119,12 @@ def handle(self, *args, **options):
else:
page.description = ''
page.language = soup.html.get('lang', u'en-US')
page.content = soup.prettify()
if options.get('strip_html') or hasattr( settings, 'HAYSTACK_STATIC_PAGES_STRIP_HTML') and settings.HAYSTACK_STATIC_PAGES_STRIP_HTML:
# remove inline javascript
[s.extract() for s in soup('script')]
page.content = strip_tags(unicode(soup.body))
else:
page.content = soup.prettify()
page.save()
count += 1

Expand Down