Commit 30666d3: move docs docstrings

djay committed Dec 28, 2012 (1 parent: 48fb3db)
Showing 6 changed files with 154 additions and 116 deletions.
117 changes: 3 additions & 114 deletions README.txt
@@ -1,118 +1,7 @@
Crawling - html to import
=========================
A source blueprint for crawling content from a site or local html files.

Webcrawler imports HTML either from a live website, from a folder on disk, or from a folder
on disk with html which used to come from a live website and may still have absolute
links referring to that website.

To crawl a live website, supply the crawler with a base http url to start crawling from.
All the other urls you want from the site must start with this url.

For example ::

[crawler]
blueprint = transmogrify.webcrawler
url = http://www.whitehouse.gov
max = 50

will restrict the crawler to the first 50 pages.

You can also crawl a local directory of html with relative links by just using a file: style url ::

[crawler]
blueprint = transmogrify.webcrawler
url = file:///mydirectory

or if the local directory contains html saved from a website and might have absolute urls in it,
then you can set this directory as the cache. The crawler will always look up the cache first ::

[crawler]
blueprint = transmogrify.webcrawler
url = http://therealsite.com --crawler:cache=mydirectory

The following will not crawl anything larger than 400000 bytes ::

[crawler]
blueprint = transmogrify.webcrawler
url = http://www.whitehouse.gov
maxsize=400000

To skip crawling links by regular expression ::

[crawler]
blueprint = transmogrify.webcrawler
url=http://www.whitehouse.gov
ignore = \.mp3
\.mp4

If webcrawler is having trouble parsing the html of some pages you can preprocess
the html before it is parsed, e.g. ::

[crawler]
blueprint = transmogrify.webcrawler
patterns = (<script>)[^<]*(</script>)
subs = \1\2

If you'd like to skip processing links with certain mimetypes you can use the
drop:condition. This TALES expression determines what will be processed further.
See http://pypi.python.org/pypi/collective.transmogrifier/#condition-section
::

[drop]
blueprint = collective.transmogrifier.sections.condition
condition: python:item.get('_mimetype') not in ['application/x-javascript','text/css','text/plain','application/x-java-byte-code'] and item.get('_path','').split('.')[-1] not in ['class']


Options
-------

site_url
- the top url to crawl

ignore
- list of regular expressions for urls not to crawl

cache
- local directory to read crawled items from instead of accessing the site directly

patterns
- Regular expressions to substitute before the html is parsed. Newline separated

subs
- Text to replace each item in patterns. Must be the same number of lines as patterns. Due to the way buildout handles empty lines, to replace a pattern with nothing (e.g. to remove the pattern), use ``<EMPTYSTRING>`` as a substitution.

maxsize
- don't crawl anything larger than this

max
- Limit crawling to this number of pages

start-urls
- a list of urls to initially crawl

ignore-robots
- if set, will ignore the robots.txt directives and crawl everything
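
As an illustration, several of these options can be combined in a single crawler section.
This is only a sketch reusing the option names above; the exact value formats for
``start-urls`` and ``ignore-robots`` are assumptions rather than documented here ::

[crawler]
blueprint = transmogrify.webcrawler
url = http://www.whitehouse.gov
start-urls = http://www.whitehouse.gov/blog
ignore = \.mp3
         \.mp4
maxsize = 400000
max = 500
ignore-robots = true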

WebCrawler will emit items like ::

item = dict(_site_url = "Original site_url used",
            _path = "The url crawled without _site_url",
            _content = "The raw content returned by the url",
            _content_info = "Headers returned with content",
            _backlinks = names,
            _sortorder = "An integer representing the order the url was found within the page/site",
            )


transmogrify.webcrawler.typerecognitor
======================================

A blueprint for assigning content type based on the mime-type as given by the
webcrawler.

transmogrify.webcrawler.cache
=============================

A blueprint that saves crawled content into a directory structure
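
As a sketch of how the cache blueprint might be wired in (the ``output`` option comes from the
staticcreator docstring added in this commit; the section name and directory are illustrative
assumptions) ::

[cache]
blueprint = transmogrify.webcrawler.cache
output = var/webcrawler-cache
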
`transmogrify.webcrawler` will crawl html to extract pages and files as a source for your transmogrifier pipeline.
`transmogrify.webcrawler.typerecognitor` aids in setting '_type' based on the crawled mimetype.
`transmogrify.webcrawler.cache` helps speed up crawling and reduce memory usage by storing items locally.
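
To show how the pieces fit together, a minimal transmogrifier pipeline chaining the three
blueprints might look like the sketch below; the section names and the cache ``output``
directory are illustrative assumptions, not part of this package's documentation ::

[transmogrifier]
pipeline =
    crawler
    typerecognitor
    cache

[crawler]
blueprint = transmogrify.webcrawler
url = http://www.whitehouse.gov
max = 50

[typerecognitor]
blueprint = transmogrify.webcrawler.typerecognitor

[cache]
blueprint = transmogrify.webcrawler.cache
output = var/webcrawler-cache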

5 changes: 5 additions & 0 deletions docs/HISTORY.txt
@@ -1,6 +1,11 @@
Changelog
=========

1.2 (2012-12-28)
----------------



1.1 (2012-04-17)
----------------

9 changes: 9 additions & 0 deletions setup.py
@@ -3,10 +3,19 @@

version = '1.2'


def docstring(filename):
    # Extract the first triple-quoted docstring from a module in the
    # transmogrify.webcrawler package (relies on the os and re imports at the top of setup.py).
    py = open(os.path.join("transmogrify", "webcrawler", filename)).read()
    return re.findall('"""(.*?)"""', py, re.DOTALL)[0]


setup(name='transmogrify.webcrawler',
      version=version,
      description="Crawling and feeding html content into a transmogrifier pipeline",
      long_description=open('README.txt').read() + '\n' +
                       docstring('webcrawler.py') + \
                       docstring('staticcreator.py') + \
                       docstring('typerecognitor.py') + \
                       # open(os.path.join("transmogrify", "webcrawler", "webcrawler.txt")).read() + "\n" +
                       # open(os.path.join("transmogrify", "webcrawler", "typerecognitor.txt")).read() + "\n" +
                       open(os.path.join("docs", "HISTORY.txt")).read(),
17 changes: 17 additions & 0 deletions transmogrify/webcrawler/staticcreator.py
@@ -20,6 +20,23 @@

_marker = object()

"""
transmogrify.webcrawler.cache
=============================
A blueprint that saves crawled content into a directory structure
Options:
:path-key:
Allows you to override the field path is stored in. Defaults to '_path'
:output:
Directory to store cached content in
"""


class StaticCreatorSection(object):
    classProvides(ISectionBlueprint)
    implements(ISection)
10 changes: 8 additions & 2 deletions transmogrify/webcrawler/typerecognitor.py
@@ -10,9 +10,16 @@
from collective.transmogrifier.interfaces import ISectionBlueprint
from collective.transmogrifier.interfaces import ISection

from transmogrify.webcrawler.external.webchecker import MyURLopener
import logging

"""
transmogrify.webcrawler.typerecognitor
======================================
A blueprint for assigning content type based on the mime-type as given by the
webcrawler
"""


class TypeRecognitor(object):
    classProvides(ISectionBlueprint)
@@ -42,7 +49,6 @@ class TypeRecognitor(object):

    def __init__(self, transmogrifier, name, options, previous):
        self.previous = previous
        self.open_url = MyURLopener().open
        self.logger = logging.getLogger(name)

    def __iter__(self):
112 changes: 112 additions & 0 deletions transmogrify/webcrawler/webcrawler.py
@@ -20,6 +20,116 @@
from staticcreator import CachingURLopener
from collections import OrderedDict

"""
transmogrify.webcrawler
=======================
A source blueprint for crawling content from a site or local html files.
Webcrawler imports HTML either from a live website, for a folder on disk, or a folder
on disk with html which used to come from a live website and may still have absolute
links refering to that website.
To crawl a live website supply the crawler with a base http url to start crawling with.
This url must be the url which all the other urls you want from the site start with.
For example ::
[crawler]
blueprint = transmogrify.webcrawler
url = http://www.whitehouse.gov
max = 50
will restrict the crawler to the first 50 pages.
You can also crawl a local directory of html with relative links by just using a file: style url ::
[crawler]
blueprint = transmogrify.webcrawler
url = file:///mydirectory
or if the local directory contains html saved from a website and might have absolute urls in it
the you can set this as the cache. The crawler will always look up the cache first ::
[crawler]
blueprint = transmogrify.webcrawler
url = http://therealsite.com --crawler:cache=mydirectory
The following will not crawl anything larget than 4Mb ::
[crawler]
blueprint = transmogrify.webcrawler
url = http://www.whitehouse.gov
maxsize=400000
To skip crawling links by regular expression ::
[crawler]
blueprint = transmogrify.webcrawler
url=http://www.whitehouse.gov
ignore = \.mp3
\.mp4
If webcrawler is having trouble parsing the html of some pages you can preprocesses
the html before it is parsed. e.g. ::
[crawler]
blueprint = transmogrify.webcrawler
patterns = (<script>)[^<]*(</script>)
subs = \1\2
If you'd like to skip processing links with certain mimetypes you can use the
drop:condition. This TALES expression determines what will be processed further.
see http://pypi.python.org/pypi/collective.transmogrifier/#condition-section
::
[drop]
blueprint = collective.transmogrifier.sections.condition
condition: python:item.get('_mimetype') not in ['application/x-javascript','text/css','text/plain','application/x-java-byte-code'] and item.get('_path','').split('.')[-1] not in ['class']
Options:
:site_url:
- the top url to crawl
:ignore:
- list of regex for urls to not crawl
:cache:
- local directory to read crawled items from instead of accessing the site directly
:patterns:
- Regular expressions to substitute before html is parsed. New line seperated
:subs:
- Text to replace each item in patterns. Must be the same number of lines as patterns. Due to the way buildout handles empty lines, to replace a pattern with nothing (eg to remove the pattern), use ``<EMPTYSTRING>`` as a substitution.
:maxsize:
- don't crawl anything larger than this
:max:
- Limit crawling to this number of pages
:start-urls:
- a list of urls to initially crawl
:ignore-robots:
- if set, will ignore the robots.txt directives and crawl everything
WebCrawler will emit items like ::
item = dict(_site_url = "Original site_url used",
_path = "The url crawled without _site_url,
_content = "The raw content returned by the url",
_content_info = "Headers returned with content"
_backlinks = names,
_sortorder = "An integer representing the order the url was found within the page/site
)
"""


VERBOSE = 0 # Verbosity level (0-3)
MAXPAGE = 0 # Ignore files bigger than this
CHECKEXT = False # Check external references (1 deep)
@@ -28,6 +138,8 @@
NONAMES = 0 # Force name anchor checking




class WebCrawler(object):
    classProvides(ISectionBlueprint)
    implements(ISection)
