move docs docstrings

commit 30666d3cc9fa8f192838604c34a73f36f62b14ff 1 parent 48fb3db
Dylan Jay authored
117 README.txt
@@ -1,118 +1,7 @@
Crawling - html to import
=========================
-A source blueprint for crawling content from a site or local html files.
-Webcrawler imports HTML either from a live website, from a folder on disk, or from a
-folder on disk holding html which used to come from a live website and may still have
-absolute links referring to that website.
-
-To crawl a live website, supply the crawler with a base http url to start crawling from.
-This url must be a prefix of every other url you want crawled from the site.
-
-For example ::
-
- [crawler]
- blueprint = transmogrify.webcrawler
- url = http://www.whitehouse.gov
- max = 50
-
-will restrict the crawler to the first 50 pages.
-
-You can also crawl a local directory of html with relative links by just using a file: style url ::
-
- [crawler]
- blueprint = transmogrify.webcrawler
- url = file:///mydirectory
-
-or, if the local directory contains html saved from a website and might have absolute urls
-in it, you can set that directory as the cache. The crawler will always look in the cache first ::
-
- [crawler]
- blueprint = transmogrify.webcrawler
- url = http://therealsite.com --crawler:cache=mydirectory
-
-The following will not crawl anything larger than 400,000 bytes ::
-
- [crawler]
- blueprint = transmogrify.webcrawler
- url = http://www.whitehouse.gov
- maxsize=400000
-
-To skip crawling links by regular expression ::
-
- [crawler]
- blueprint = transmogrify.webcrawler
- url=http://www.whitehouse.gov
- ignore = \.mp3
- \.mp4
-
-If webcrawler is having trouble parsing the html of some pages you can preprocess
-the html before it is parsed, e.g. ::
-
- [crawler]
- blueprint = transmogrify.webcrawler
- patterns = (<script>)[^<]*(</script>)
- subs = \1\2
-
-If you'd like to skip processing links with certain mimetypes you can use a
-condition section. Its TALES expression determines which items will be processed further;
-see http://pypi.python.org/pypi/collective.transmogrifier/#condition-section
-::
-
- [drop]
- blueprint = collective.transmogrifier.sections.condition
- condition: python:item.get('_mimetype') not in ['application/x-javascript','text/css','text/plain','application/x-java-byte-code'] and item.get('_path','').split('.')[-1] not in ['class']
-
-
-Options
--------
-
-site_url
- - the top url to crawl
-
-ignore
- - list of regular expressions for urls not to crawl
-
-cache
- - local directory to read crawled items from instead of accessing the site directly
-
-patterns
- - Regular expressions to substitute before the html is parsed. Newline separated
-
-subs
- - Text to replace each item in patterns. Must be the same number of lines as patterns. Due to the way buildout handles empty lines, to replace a pattern with nothing (e.g. to remove the pattern), use ``<EMPTYSTRING>`` as the substitution.
-
-maxsize
- - don't crawl anything larger than this
-
-max
- - Limit crawling to this number of pages
-
-start-urls
- - a list of urls to initially crawl
-
-ignore-robots
- - if set, will ignore the robots.txt directives and crawl everything
-
-WebCrawler will emit items like ::
-
- item = dict(_site_url = "Original site_url used",
-             _path = "The url crawled without _site_url",
-             _content = "The raw content returned by the url",
-             _content_info = "Headers returned with content",
-             _backlinks = names,
-             _sortorder = "An integer representing the order the url was found within the page/site",
-             )
-
-
-transmogrify.webcrawler.typerecognitor
-======================================
-
-A blueprint for assigning content type based on the mime-type as given by the
-webcrawler
-
-transmogrify.webcrawler.cache
-=============================
-
-A blueprint that saves crawled content into a directory structure
+`transmogrify.webcrawler` will crawl html to extract pages and files as a source for your transmogrifier pipeline.
+`transmogrify.webcrawler.typerecognitor` aids in setting '_type' based on the crawled mimetype.
+`transmogrify.webcrawler.cache` helps speed up crawling and reduce memory usage by storing items locally.
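+
+A minimal sketch of a pipeline wiring the three blueprints together (the section
+names, the url and the cache directory below are illustrative, not defaults) ::
+
+ [transmogrifier]
+ pipeline =
+     crawler
+     typerecognitor
+     cache
+
+ [crawler]
+ blueprint = transmogrify.webcrawler
+ url = http://www.whitehouse.gov
+ max = 50
+
+ [typerecognitor]
+ blueprint = transmogrify.webcrawler.typerecognitor
+
+ [cache]
+ blueprint = transmogrify.webcrawler.cache
+ output = var/webcrawlercache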
5 docs/HISTORY.txt
@@ -1,6 +1,11 @@
Changelog
=========
+1.2 (2012-12-28)
+----------------
+
+
+
1.1 (2012-04-17)
----------------
9 setup.py
@@ -3,10 +3,19 @@
version = '1.2'
+
+import re  # needed by docstring(); harmless if re is already imported at the top
+
+def docstring(filename):
+    py = open(os.path.join("transmogrify", "webcrawler", filename)).read()
+    return re.findall('"""(.*?)"""', py, re.DOTALL)[0]
+
+
setup(name='transmogrify.webcrawler',
version=version,
description="Crawling and feeding html content into a transmogrifier pipeline",
long_description=open('README.txt').read() + '\n' +
+ docstring('webcrawler.py') + \
+ docstring('staticcreator.py') + \
+ docstring('typerecognitor.py') + \
# open(os.path.join("transmogrify", "webcrawler", "webcrawler.txt")).read() + "\n" +
# open(os.path.join("transmogrify", "webcrawler", "typerecognitor.txt")).read() + "\n" +
open(os.path.join("docs", "HISTORY.txt")).read(),
17 transmogrify/webcrawler/staticcreator.py
@@ -20,6 +20,23 @@
_marker = object()
+"""
+transmogrify.webcrawler.cache
+=============================
+
+A blueprint that saves crawled content into a directory structure
+
+Options:
+
+:path-key:
+ Allows you to override the field the path is stored in. Defaults to '_path'
+
+:output:
+ Directory to store cached content in
+
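+For example, a cache section might look like this (the output directory is
+illustrative) ::
+
+ [cache]
+ blueprint = transmogrify.webcrawler.cache
+ output = var/webcrawlercache
+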
+"""
+
+
class StaticCreatorSection(object):
classProvides(ISectionBlueprint)
implements(ISection)
10 transmogrify/webcrawler/typerecognitor.py
@@ -10,9 +10,16 @@
from collective.transmogrifier.interfaces import ISectionBlueprint
from collective.transmogrifier.interfaces import ISection
-from transmogrify.webcrawler.external.webchecker import MyURLopener
import logging
+"""
+transmogrify.webcrawler.typerecognitor
+======================================
+
+A blueprint for assigning content type based on the mime-type as given by the
+webcrawler
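+
+For example, placed directly after the crawler section in the pipeline it will
+set '_type' on each crawled item based on its mimetype (the section name is
+illustrative) ::
+
+ [typerecognitor]
+ blueprint = transmogrify.webcrawler.typerecognitor
+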
+"""
+
class TypeRecognitor(object):
classProvides(ISectionBlueprint)
@@ -42,7 +49,6 @@ class TypeRecognitor(object):
def __init__(self, transmogrifier, name, options, previous):
self.previous = previous
- self.open_url = MyURLopener().open
self.logger = logging.getLogger(name)
def __iter__(self):
112 transmogrify/webcrawler/webcrawler.py
@@ -20,6 +20,116 @@
from staticcreator import CachingURLopener
from collections import OrderedDict
+"""
+transmogrify.webcrawler
+=======================
+
+A source blueprint for crawling content from a site or local html files.
+
+Webcrawler imports HTML either from a live website, from a folder on disk, or from a
+folder on disk holding html which used to come from a live website and may still have
+absolute links referring to that website.
+
+To crawl a live website, supply the crawler with a base http url to start crawling from.
+This url must be a prefix of every other url you want crawled from the site.
+
+For example ::
+
+ [crawler]
+ blueprint = transmogrify.webcrawler
+ url = http://www.whitehouse.gov
+ max = 50
+
+will restrict the crawler to the first 50 pages.
+
+You can also crawl a local directory of html with relative links by just using a file: style url ::
+
+ [crawler]
+ blueprint = transmogrify.webcrawler
+ url = file:///mydirectory
+
+or, if the local directory contains html saved from a website and might have absolute urls
+in it, you can set that directory as the cache. The crawler will always look in the cache first ::
+
+ [crawler]
+ blueprint = transmogrify.webcrawler
+ url = http://therealsite.com --crawler:cache=mydirectory
+
+The following will not crawl anything larger than 400,000 bytes ::
+
+ [crawler]
+ blueprint = transmogrify.webcrawler
+ url = http://www.whitehouse.gov
+ maxsize=400000
+
+To skip crawling links by regular expression ::
+
+ [crawler]
+ blueprint = transmogrify.webcrawler
+ url=http://www.whitehouse.gov
+ ignore = \.mp3
+ \.mp4
+
+If webcrawler is having trouble parsing the html of some pages you can preprocess
+the html before it is parsed, e.g. ::
+
+ [crawler]
+ blueprint = transmogrify.webcrawler
+ patterns = (<script>)[^<]*(</script>)
+ subs = \1\2
+
+If you'd like to skip processing links with certain mimetypes you can use a
+condition section. Its TALES expression determines which items will be processed further;
+see http://pypi.python.org/pypi/collective.transmogrifier/#condition-section
+::
+
+ [drop]
+ blueprint = collective.transmogrifier.sections.condition
+ condition: python:item.get('_mimetype') not in ['application/x-javascript','text/css','text/plain','application/x-java-byte-code'] and item.get('_path','').split('.')[-1] not in ['class']
+
+
+Options:
+
+:site_url:
+ - the top url to crawl
+
+:ignore:
+ - list of regular expressions for urls not to crawl
+
+:cache:
+ - local directory to read crawled items from instead of accessing the site directly
+
+:patterns:
+ - Regular expressions to substitute before the html is parsed. Newline separated
+
+:subs:
+ - Text to replace each item in patterns. Must be the same number of lines as patterns. Due to the way buildout handles empty lines, to replace a pattern with nothing (e.g. to remove the pattern), use ``<EMPTYSTRING>`` as the substitution.
+
+:maxsize:
+ - don't crawl anything larger than this
+
+:max:
+ - Limit crawling to this number of pages
+
+:start-urls:
+ - a list of urls to initially crawl
+
+:ignore-robots:
+ - if set, will ignore the robots.txt directives and crawl everything (see the example after this list)
+
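+For example, to seed the crawl with an extra start url and to ignore robots.txt
+(the extra url and the flag value are illustrative) ::
+
+ [crawler]
+ blueprint = transmogrify.webcrawler
+ url = http://www.whitehouse.gov
+ start-urls = http://www.whitehouse.gov/blog
+ ignore-robots = true
+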
+WebCrawler will emit items like ::
+
+ item = dict(_site_url = "Original site_url used",
+             _path = "The url crawled without _site_url",
+             _content = "The raw content returned by the url",
+             _content_info = "Headers returned with content",
+             _backlinks = names,
+             _sortorder = "An integer representing the order the url was found within the page/site",
+             )
+
+"""
+
+
VERBOSE = 0 # Verbosity level (0-3)
MAXPAGE = 0 # Ignore files bigger than this
CHECKEXT = False # Check external references (1 deep)
@@ -28,6 +138,8 @@
NONAMES = 0 # Force name anchor checking
+
+
class WebCrawler(object):
classProvides(ISectionBlueprint)
implements(ISection)