Skip to content

Commit

Permalink
Add attribute matching by path.
Browse files Browse the repository at this point in the history
  • Loading branch information
ivanteoh committed Oct 10, 2012
1 parent 9a4e04e commit 30e1be1
Showing 1 changed file with 120 additions and 69 deletions.
189 changes: 120 additions & 69 deletions transmogrify/htmlcontentextractor/templatefinder.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,15 @@

import fnmatch
#import fnmatch
from zope.interface import classProvides
from zope.interface import implements
from collective.transmogrifier.interfaces import ISectionBlueprint
from collective.transmogrifier.interfaces import ISection
from collective.transmogrifier.utils import Matcher
#from collective.transmogrifier.utils import Matcher

from webstemmer.analyze import PageFeeder, LayoutAnalyzer, LayoutCluster
from webstemmer.extract import TextExtractor, LayoutPatternSet, LayoutPattern
from webstemmer.layoutils import sigchars, get_textblocks, retrieve_blocks, WEBSTEMMER_VERSION, KEY_ATTRS
from webstemmer.zipdb import ACLDB
from webstemmer.htmldom import parse
#from webstemmer.analyze import PageFeeder, LayoutAnalyzer, LayoutCluster
#from webstemmer.extract import TextExtractor, LayoutPatternSet, LayoutPattern
#from webstemmer.layoutils import sigchars, get_textblocks, retrieve_blocks, WEBSTEMMER_VERSION, KEY_ATTRS
#from webstemmer.zipdb import ACLDB
#from webstemmer.htmldom import parse
from lxml import etree
import lxml.html
import lxml.html.soupparser
Expand All @@ -19,13 +18,11 @@
import datetime
from collections import OrderedDict

from StringIO import StringIO
from sys import stderr
#from StringIO import StringIO
#from sys import stderr

import logging



"""
XPath Tests
===========
Expand Down Expand Up @@ -58,19 +55,20 @@
"""


ns = {'re':"http://exslt.org/regular-expressions"}
ns = {'re': "http://exslt.org/regular-expressions"}

import re
attr = re.compile(r':(?P<attr>[^/:]*)=(?P<val>[^/:]*)')


def toXPath(pat):
    """Translate a compact ``tag:attr=val/tag`` pattern into an XPath query.

    Example: ``td:valign=top/p:class=msonormal/span`` becomes
    ``//td[re:test(@valign,"^top$","i")]//p[re:test(@class,"^msonormal$","i")]//span``.
    Attribute tests use the EXSLT regular-expression extension and are
    anchored and case-insensitive.
    """
    attr_test = re.compile(r':(?P<attr>[^/:]*)=(?P<val>[^/:]*)')
    xpath = attr_test.sub(r'[re:test(@\g<attr>,"^\g<val>$","i")]', pat)
    # a single '/' in the compact pattern means "any descendant" -> '//'
    return "//" + xpath.replace('/', '//')

default_charset = 'utf-8'

default_charset='utf-8'

class TemplateFinder(object):
classProvides(ISectionBlueprint)
Expand All @@ -88,16 +86,17 @@ class TemplateFinder(object):
2-title = html //h2
"""



def __init__(self, transmogrifier, name, options, previous):
self.previous = previous
self.groups = {}
self.name = name
self.logger = logging.getLogger(name)
order = options.get('_order','').split()
order = options.get('_order', '').split()
self.match = options.get('_match', '/').strip()
self.apply_to_paths = options.get('_apply_to_paths', '').strip()

def specialkey(key):
if key in ['blueprint','debug','_order']:
if key in ['blueprint', 'debug', '_order', '_match', '_apply_to_paths']:
return True
if key in order:
return True
Expand All @@ -112,44 +111,98 @@ def specialkey(key):
group, field = key.split('-', 1)
group = int(group)
except:
group, field = '1',key
group, field = '1', key
xps = []
res = re.findall("(?m)^(text|html|optional|delete|tal|optionaltext|optionalhtml)\s(.*)$", value)
if not res:
format,value = 'html',value
format, value = 'html', value
else:
format,value = res[0]
format, value = res[0]
for line in value.strip().split('\n'):
xp = line.strip()
if format.lower() == 'tal':
xp = Expression(xp, transmogrifier, name, options, datetime=datetime)
xps.append((format,xp))
xps.append((format, xp))
group = self.groups.setdefault(group, OrderedDict())
group[field] = xps

def __iter__(self):
    """Dispatch to the configured extraction mode.

    When ``_apply_to_paths`` was set in the options, attributes are
    applied to other items selected by path; otherwise each item gets
    the attributes extracted from its own HTML.
    """
    if self.apply_to_paths:
        return self.attribute_to_paths()
    return self.attribute_to_item()

def attribute_to_item(self):
    """Default mode: run the pipeline so extracted attributes land on the
    same item the HTML came from."""
    pipeline = self.previous
    return self.process_items(pipeline)

def attribute_to_paths(self):
    """Process items, applying extracted attributes to OTHER items whose
    paths are selected by the ``_apply_to_paths`` xpath, optionally
    splitting each page into fragments with the ``_match`` xpath.

    Two passes over the pipeline:
    1. Buffer every item.  For each HTML item, split it into ``_match``
       fragments; run the normal template extraction on each fragment as
       a pseudo-item, and if exactly one extracted result comes back,
       remember its attributes keyed by every target path the fragment
       points at.
    2. Replay all buffered items, merging the collected attributes into
       any item whose ``_path`` was targeted.
    """
    site_items = []
    collected_pseudo_items = {}

    for item in self.previous:
        site_items.append(item)
        content = self.getHtml(item)
        if content is None:
            continue

        tree = lxml.html.fromstring(content)
        nodes = tree.xpath(self.match, namespaces=ns)

        # item attribute: _site_url, content, mimetype
        for node in nodes:
            child_content = etree.tostring(node)
            child_tree = lxml.html.fromstring(child_content)
            child_paths = child_tree.xpath(self.apply_to_paths, namespaces=ns)
            pseudo_item = dict(_path=item["_path"], _mimetype="text/html",
                               _content=child_content,
                               _site_url=item["_site_url"])
            children = list(self.process_items([pseudo_item]))

            # only record the fragment when extraction produced exactly
            # one result and a template actually matched it
            if len(children) == 1 and '_template' in children[0]:
                for child_path in child_paths:
                    # TODO: Better path normalization, eg: http://example.com/123.asp
                    if child_path.startswith("/"):
                        collected_pseudo_items[child_path.strip("/")] = children[0]

    # e.g. {"topics/2030.asp": {'description': 'smoking', ...}} applied to
    # [{"title": "smoking", "_path": "topics/2030.asp", ...}, ...]
    for item in site_items:
        path = item.get("_path", "")
        if path in collected_pseudo_items:
            item.update(collected_pseudo_items[path])
        yield item

def process_items(self, items):
"""Process items from basic template"""
notextracted = []
total = 0
skipped = 0
alreadymatched = 0
stats = {}
for item in self.previous:
for item in items:
#import pdb; pdb.set_trace()
total += 1
content = self.getHtml(item)
path = item.get('_path','')
path = item.get('_path', '')
if content is None:
#log.warning('(%s) content is None'%item['_path'])
skipped += 1
if path:
self.logger.debug("SKIP: %s (no html)"%(path))
self.logger.debug("SKIP: %s (no html)" % (path))
yield item
continue
if '_template' in item:
# don't apply the template if another has already been applied
alreadymatched += 1
self.logger.debug("SKIP: %s (already extracted)"%(item['_path']))
self.logger.debug("SKIP: %s (already extracted)" % (item['_path']))
yield item
continue
path = item['_site_url'] + item['_path']
Expand All @@ -167,18 +220,18 @@ def __iter__(self):
else:
notextracted.append(item)
yield item
# for item in notextracted:
# yield item
self.logger.info("extracted %d/%d/%d/%d %s"%(total-len(notextracted)-alreadymatched-skipped,
total-alreadymatched-skipped,
total-skipped,
total, stats))

# for item in notextracted:
# yield item
self.logger.info("extracted %d/%d/%d/%d %s" % (total - len(notextracted) - alreadymatched - skipped,
total - alreadymatched - skipped,
total - skipped,
total, stats))

def extract(self, pats, tree, item, stats):
unique = OrderedDict()
nomatch = []
optional = []
#import pdb; pdb.set_trace()
for field, xps in pats.items():
if field == 'path':
continue
Expand All @@ -189,27 +242,27 @@ def extract(self, pats, tree, item, stats):
#treat special so normal node ops still work
xp = xp.strip()[:-7]
if format.lower().endswith('html'):
format = format.lower()[:-4]+'text'
format = format.lower()[:-4] + 'text'
elif format.lower().startswith('optional'):
format = 'optionaltext'
nodes = tree.xpath(xp, namespaces=ns)
if not nodes:
if format.lower().startswith('optional'):
optional.append( (field, xp))
optional.append((field, xp))
else:
nomatch.append( (field,xp) )
self.logger.debug("FAIL %s:%s=%s %s\n%s"%(item['_path'],
nomatch.append((field, xp))
self.logger.debug("FAIL %s:%s=%s %s\n%s" % (item['_path'],
field, format, xp,
etree.tostring(tree, method='html', encoding=unicode)))
continue

nodes = [(format, n) for n in nodes]
unique[field] = nonoverlap(unique.setdefault(field,[]), nodes)
unique[field] = nonoverlap(unique.setdefault(field, []), nodes)
if nomatch:
matched = [field for field in unique.keys()]
unmatched = [field for field, xp in nomatch]
self.logger.info( "FAIL: '%s' matched=%s, unmatched=%s" % (item['_path'],
matched, unmatched) )
self.logger.info("FAIL: '%s' matched=%s, unmatched=%s" % (item['_path'],
matched, unmatched))
return False
extracted = {}
assert unique
Expand All @@ -222,19 +275,19 @@ def extract(self, pats, tree, item, stats):
continue
if not node.getparent():
# already dropped
toremove.append((format,node))
toremove.append((format, node))
continue
try:
node.drop_tree()
except:
self.logger.error("error in drop_tree %s=%s"%(field,etree.tostring(node, method='html', encoding=unicode)))
self.logger.error("error in drop_tree %s=%s" % (field, etree.tostring(node, method='html', encoding=unicode)))
for node in toremove:
nodes.remove(node)

for field, nodes in unique.items():
for format, node in nodes:
extracted.setdefault(field,'')
format = format.lower().replace('optional','')
extracted.setdefault(field, '')
format = format.lower().replace('optional', '')
if format in ['delete']:
continue
if not getattr(node, 'iterancestors', None):
Expand All @@ -255,7 +308,7 @@ def extract(self, pats, tree, item, stats):
# lxml.html.fragment_fromstring(html)
# except lxml.etree.ParserError:
# extracted[field] = html

item.update(extracted)

#match tal format
Expand All @@ -270,27 +323,27 @@ def extract(self, pats, tree, item, stats):
extracted[field] = extracted.get(field, '') + value
item.update(extracted)


unmatched = set([field for field,xp in optional])
unmatched = set([field for field, xp in optional])
matched = set(unique.keys()) - set(unmatched)
for field in matched:
stats[field] = stats.get(field,0) + 1
self.logger.info( "PASS: '%s' matched=%s, unmatched=%s", item['_path'], list(matched) , list(unmatched))
stats[field] = stats.get(field, 0) + 1
self.logger.info("PASS: '%s' matched=%s, unmatched=%s", item['_path'], list(matched), list(unmatched))
if '_tree' in item:
del item['_tree']
item['_template'] = None
return item

def getHtml(self, item):
    """Return the item's HTML content, or None when there is none to use.

    Content is read from '_content', falling back to 'text'.  It is only
    returned when the item also carries a '_path' and an HTML mimetype
    ('text/xhtml' or 'text/html'); anything else yields None so callers
    can skip the item.
    """
    path = item.get('_path', None)
    content = item.get('_content', None) or item.get('text', None)
    mimetype = item.get('_mimetype', None)
    if path is not None and \
       content is not None and \
       mimetype in ['text/xhtml', 'text/html']:
        return content
    return None


def ancestors(e):
Expand All @@ -302,23 +355,21 @@ def ancestors(e):


def nonoverlap(unique, new):
    """Return the elements which aren't descendants of each other.

    Merge the (format, element) pairs from *new* into *unique*, keeping
    only the outermost elements: an incoming element replaces any equal
    or descendant element already collected, and is itself dropped when
    one of its ancestors is already in *unique*.  *unique* is mutated in
    place and also returned.
    """
    for format, e1 in new:
        # if e1 is an ascendant then replace
        add = True
        toremove = []
        for f, e in unique:
            if e1 == e:
                # same node already collected under another field/format
                toremove.append((f, e))
            elif e1 in set(ancestors(e)):
                # e1 encloses the collected e, so e is redundant
                toremove.append((f, e))
            elif e in set(ancestors(e1)):
                # a collected element already encloses e1: skip it
                add = False
                break
        if add:
            unique.append((format, e1))
        for pair in toremove:
            unique.remove(pair)
    return unique


0 comments on commit 30e1be1

Please sign in to comment.