Skip to content

Commit

Permalink
Add attribute matching by path.
Browse files Browse the repository at this point in the history
  • Loading branch information
ivanteoh committed Oct 10, 2012
1 parent 9a4e04e commit 30e1be1
Showing 1 changed file with 120 additions and 69 deletions.
189 changes: 120 additions & 69 deletions transmogrify/htmlcontentextractor/templatefinder.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,15 @@

import fnmatch
#import fnmatch
from zope.interface import classProvides
from zope.interface import implements
from collective.transmogrifier.interfaces import ISectionBlueprint
from collective.transmogrifier.interfaces import ISection
from collective.transmogrifier.utils import Matcher
#from collective.transmogrifier.utils import Matcher

from webstemmer.analyze import PageFeeder, LayoutAnalyzer, LayoutCluster
from webstemmer.extract import TextExtractor, LayoutPatternSet, LayoutPattern
from webstemmer.layoutils import sigchars, get_textblocks, retrieve_blocks, WEBSTEMMER_VERSION, KEY_ATTRS
from webstemmer.zipdb import ACLDB
from webstemmer.htmldom import parse
#from webstemmer.analyze import PageFeeder, LayoutAnalyzer, LayoutCluster
#from webstemmer.extract import TextExtractor, LayoutPatternSet, LayoutPattern
#from webstemmer.layoutils import sigchars, get_textblocks, retrieve_blocks, WEBSTEMMER_VERSION, KEY_ATTRS
#from webstemmer.zipdb import ACLDB
#from webstemmer.htmldom import parse
from lxml import etree
import lxml.html
import lxml.html.soupparser
Expand All @@ -19,13 +18,11 @@
import datetime
from collections import OrderedDict

from StringIO import StringIO
from sys import stderr
#from StringIO import StringIO
#from sys import stderr

import logging



"""
XPath Tests
===========
Expand Down Expand Up @@ -58,19 +55,20 @@
"""


ns = {'re':"http://exslt.org/regular-expressions"}
ns = {'re': "http://exslt.org/regular-expressions"}

import re
attr = re.compile(r':(?P<attr>[^/:]*)=(?P<val>[^/:]*)')


def toXPath(pat):
    """Translate a compact ``tag:attr=val/tag`` pattern into an XPath query.

    Example: ``td:valign=top/p:class=msonormal/span`` becomes
    ``//td[re:test(@valign,"^top$","i")]//p[re:test(@class,"^msonormal$","i")]//span``.
    Attribute tests use the EXSLT regular-expression extension and are
    anchored and case-insensitive.
    """
    attr_test = re.compile(r':(?P<attr>[^/:]*)=(?P<val>[^/:]*)')
    xpath = attr_test.sub(r'[re:test(@\g<attr>,"^\g<val>$","i")]', pat)
    # a single '/' in the compact pattern means "any descendant" -> '//'
    return "//" + xpath.replace('/', '//')

default_charset = 'utf-8'

default_charset='utf-8'

class TemplateFinder(object):
classProvides(ISectionBlueprint)
Expand All @@ -88,16 +86,17 @@ class TemplateFinder(object):
2-title = html //h2
"""



def __init__(self, transmogrifier, name, options, previous):
self.previous = previous
self.groups = {}
self.name = name
self.logger = logging.getLogger(name)
order = options.get('_order','').split()
order = options.get('_order', '').split()
self.match = options.get('_match', '/').strip()
self.apply_to_paths = options.get('_apply_to_paths', '').strip()

def specialkey(key):
if key in ['blueprint','debug','_order']:
if key in ['blueprint', 'debug', '_order', '_match', '_apply_to_paths']:
return True
if key in order:
return True
Expand All @@ -112,44 +111,98 @@ def specialkey(key):
group, field = key.split('-', 1)
group = int(group)
except:
group, field = '1',key
group, field = '1', key
xps = []
res = re.findall("(?m)^(text|html|optional|delete|tal|optionaltext|optionalhtml)\s(.*)$", value)
if not res:
format,value = 'html',value
format, value = 'html', value
else:
format,value = res[0]
format, value = res[0]
for line in value.strip().split('\n'):
xp = line.strip()
if format.lower() == 'tal':
xp = Expression(xp, transmogrifier, name, options, datetime=datetime)
xps.append((format,xp))
xps.append((format, xp))
group = self.groups.setdefault(group, OrderedDict())
group[field] = xps

def __iter__(self):
    """Dispatch to the configured extraction mode.

    When ``_apply_to_paths`` was set in the options, attributes are
    applied to other items selected by path; otherwise each item gets
    the attributes extracted from its own HTML.
    """
    if self.apply_to_paths:
        return self.attribute_to_paths()
    return self.attribute_to_item()

def attribute_to_item(self):
    """Default mode: run the pipeline so extracted attributes land on the
    same item the HTML came from."""
    pipeline = self.previous
    return self.process_items(pipeline)

def attribute_to_paths(self):
    """Process items, applying extracted attributes to OTHER items whose
    paths are selected by the ``_apply_to_paths`` xpath, optionally
    splitting each page into fragments with the ``_match`` xpath.

    Two passes over the pipeline:
    1. Buffer every item.  For each HTML item, split it into ``_match``
       fragments; run the normal template extraction on each fragment as
       a pseudo-item, and if exactly one extracted result comes back,
       remember its attributes keyed by every target path the fragment
       points at.
    2. Replay all buffered items, merging the collected attributes into
       any item whose ``_path`` was targeted.
    """
    site_items = []
    collected_pseudo_items = {}

    for item in self.previous:
        site_items.append(item)
        content = self.getHtml(item)
        if content is None:
            continue

        tree = lxml.html.fromstring(content)
        nodes = tree.xpath(self.match, namespaces=ns)

        # item attribute: _site_url, content, mimetype
        for node in nodes:
            child_content = etree.tostring(node)
            child_tree = lxml.html.fromstring(child_content)
            child_paths = child_tree.xpath(self.apply_to_paths, namespaces=ns)
            pseudo_item = dict(_path=item["_path"], _mimetype="text/html",
                               _content=child_content,
                               _site_url=item["_site_url"])
            children = list(self.process_items([pseudo_item]))

            # only record the fragment when extraction produced exactly
            # one result and a template actually matched it
            if len(children) == 1 and '_template' in children[0]:
                for child_path in child_paths:
                    # TODO: Better path normalization, eg: http://example.com/123.asp
                    if child_path.startswith("/"):
                        collected_pseudo_items[child_path.strip("/")] = children[0]

    # e.g. {"topics/2030.asp": {'description': 'smoking', ...}} applied to
    # [{"title": "smoking", "_path": "topics/2030.asp", ...}, ...]
    for item in site_items:
        path = item.get("_path", "")
        if path in collected_pseudo_items:
            item.update(collected_pseudo_items[path])
        yield item

def process_items(self, items):
"""Process items from basic template"""
notextracted = []
total = 0
skipped = 0
alreadymatched = 0
stats = {}
for item in self.previous:
for item in items:
#import pdb; pdb.set_trace()
total += 1
content = self.getHtml(item)
path = item.get('_path','')
path = item.get('_path', '')
if content is None:
#log.warning('(%s) content is None'%item['_path'])
skipped += 1
if path:
self.logger.debug("SKIP: %s (no html)"%(path))
self.logger.debug("SKIP: %s (no html)" % (path))
yield item
continue
if '_template' in item:
# don't apply the template if another has already been applied
alreadymatched += 1
self.logger.debug("SKIP: %s (already extracted)"%(item['_path']))
self.logger.debug("SKIP: %s (already extracted)" % (item['_path']))
yield item
continue
path = item['_site_url'] + item['_path']
Expand All @@ -167,18 +220,18 @@ def __iter__(self):
else:
notextracted.append(item)
yield item
# for item in notextracted:
# yield item
self.logger.info("extracted %d/%d/%d/%d %s"%(total-len(notextracted)-alreadymatched-skipped,
total-alreadymatched-skipped,
total-skipped,
total, stats))

# for item in notextracted:
# yield item
self.logger.info("extracted %d/%d/%d/%d %s" % (total - len(notextracted) - alreadymatched - skipped,
total - alreadymatched - skipped,
total - skipped,
total, stats))

def extract(self, pats, tree, item, stats):
unique = OrderedDict()
nomatch = []
optional = []
#import pdb; pdb.set_trace()
for field, xps in pats.items():
if field == 'path':
continue
Expand All @@ -189,27 +242,27 @@ def extract(self, pats, tree, item, stats):
#treat special so normal node ops still work
xp = xp.strip()[:-7]
if format.lower().endswith('html'):
format = format.lower()[:-4]+'text'
format = format.lower()[:-4] + 'text'
elif format.lower().startswith('optional'):
format = 'optionaltext'
nodes = tree.xpath(xp, namespaces=ns)
if not nodes:
if format.lower().startswith('optional'):
optional.append( (field, xp))
optional.append((field, xp))
else:
nomatch.append( (field,xp) )
self.logger.debug("FAIL %s:%s=%s %s\n%s"%(item['_path'],
nomatch.append((field, xp))
self.logger.debug("FAIL %s:%s=%s %s\n%s" % (item['_path'],
field, format, xp,
etree.tostring(tree, method='html', encoding=unicode)))
continue

nodes = [(format, n) for n in nodes]
unique[field] = nonoverlap(unique.setdefault(field,[]), nodes)
unique[field] = nonoverlap(unique.setdefault(field, []), nodes)
if nomatch:
matched = [field for field in unique.keys()]
unmatched = [field for field, xp in nomatch]
self.logger.info( "FAIL: '%s' matched=%s, unmatched=%s" % (item['_path'],
matched, unmatched) )
self.logger.info("FAIL: '%s' matched=%s, unmatched=%s" % (item['_path'],
matched, unmatched))
return False
extracted = {}
assert unique
Expand All @@ -222,19 +275,19 @@ def extract(self, pats, tree, item, stats):
continue
if not node.getparent():
# already dropped
toremove.append((format,node))
toremove.append((format, node))
continue
try:
node.drop_tree()
except:
self.logger.error("error in drop_tree %s=%s"%(field,etree.tostring(node, method='html', encoding=unicode)))
self.logger.error("error in drop_tree %s=%s" % (field, etree.tostring(node, method='html', encoding=unicode)))
for node in toremove:
nodes.remove(node)

for field, nodes in unique.items():
for format, node in nodes:
extracted.setdefault(field,'')
format = format.lower().replace('optional','')
extracted.setdefault(field, '')
format = format.lower().replace('optional', '')
if format in ['delete']:
continue
if not getattr(node, 'iterancestors', None):
Expand All @@ -255,7 +308,7 @@ def extract(self, pats, tree, item, stats):
# lxml.html.fragment_fromstring(html)
# except lxml.etree.ParserError:
# extracted[field] = html

item.update(extracted)

#match tal format
Expand All @@ -270,27 +323,27 @@ def extract(self, pats, tree, item, stats):
extracted[field] = extracted.get(field, '') + value
item.update(extracted)


unmatched = set([field for field,xp in optional])
unmatched = set([field for field, xp in optional])
matched = set(unique.keys()) - set(unmatched)
for field in matched:
stats[field] = stats.get(field,0) + 1
self.logger.info( "PASS: '%s' matched=%s, unmatched=%s", item['_path'], list(matched) , list(unmatched))
stats[field] = stats.get(field, 0) + 1
self.logger.info("PASS: '%s' matched=%s, unmatched=%s", item['_path'], list(matched), list(unmatched))
if '_tree' in item:
del item['_tree']
item['_template'] = None
return item

def getHtml(self, item):
    """Return the item's HTML content, or None when there is none to use.

    Content is read from '_content', falling back to 'text'.  It is only
    returned when the item also carries a '_path' and an HTML mimetype
    ('text/xhtml' or 'text/html'); anything else yields None so callers
    can skip the item.
    """
    path = item.get('_path', None)
    content = item.get('_content', None) or item.get('text', None)
    mimetype = item.get('_mimetype', None)
    if path is not None and \
       content is not None and \
       mimetype in ['text/xhtml', 'text/html']:
        return content
    return None


def ancestors(e):
Expand All @@ -302,23 +355,21 @@ def ancestors(e):


def nonoverlap(unique, new):
    """Return the elements which aren't descendants of each other.

    Merge the (format, element) pairs from *new* into *unique*, keeping
    only the outermost elements: an incoming element replaces any equal
    or descendant element already collected, and is itself dropped when
    one of its ancestors is already in *unique*.  *unique* is mutated in
    place and also returned.
    """
    for format, e1 in new:
        # if e1 is an ascendant then replace
        add = True
        toremove = []
        for f, e in unique:
            if e1 == e:
                # same node already collected under another field/format
                toremove.append((f, e))
            elif e1 in set(ancestors(e)):
                # e1 encloses the collected e, so e is redundant
                toremove.append((f, e))
            elif e in set(ancestors(e1)):
                # a collected element already encloses e1: skip it
                add = False
                break
        if add:
            unique.append((format, e1))
        for pair in toremove:
            unique.remove(pair)
    return unique


0 comments on commit 30e1be1

Please sign in to comment.