Skip to content

Commit

Permalink
- better handling when extracting from a listing and links haven't been crawled
Browse files Browse the repository at this point in the history

- better error for TAL, and include DateTime and re
  • Loading branch information
djay committed Jun 23, 2014
1 parent 7648226 commit 792baf5
Showing 1 changed file with 21 additions and 4 deletions.
25 changes: 21 additions & 4 deletions transmogrify/htmlcontentextractor/templatefinder.py
Expand Up @@ -9,13 +9,15 @@
import lxml.etree
from collective.transmogrifier.utils import Expression
import datetime
from DateTime import DateTime
try:
from collections import OrderedDict
except ImportError:
# python 2.6 or earlier, use backport
from ordereddict import OrderedDict
import logging
import urlparse
import sys

"""
transmogrify.htmlcontentextractor
Expand Down Expand Up @@ -90,6 +92,8 @@
ns = {'re': "http://exslt.org/regular-expressions"}
attr = re.compile(r':(?P<attr>[^/:]*)=(?P<val>[^/:]*)')

class TALException(Exception):
    """Raised when a per-field TAL expression fails to evaluate.

    Wraps the underlying error with the name of the field whose TAL
    expression caused it, so the failing rule can be identified.
    """

def toXPath(pat):
#td:valign=top/p:class=msonormal/span
Expand Down Expand Up @@ -161,7 +165,7 @@ def specialkey(key):
for line in value.strip().split('\n'):
xp = line.strip()
if format.lower() == 'tal':
xp = Expression(xp, transmogrifier, name, options, datetime=datetime)
xp = Expression(xp, transmogrifier, name, options, datetime=datetime, DateTime=DateTime)
xps.append((format, xp))
group = self.groups.setdefault(group, OrderedDict())
group[field] = xps
Expand Down Expand Up @@ -220,15 +224,21 @@ def __iter__(self):
repeated = [tree]

gotit = False
uncrawled_targets = 0
for fragment in repeated:
# get each target_item in the path selection and process with fragment_content
if self.repeat:
target_url = None
target_item = None
for target_url in fragment.xpath(self.url, namespaces=ns):
target_url = urlparse.urljoin(base, target_url.strip("/"))
if target_url in site_items_lookup:
target_item = site_items_lookup[target_url]
break
if target_item is None:
# we haven't crawled the target page so can't set the
# metadata.
uncrawled_targets += 1
continue
else:
target_item = item
path = target_item['_path']
Expand All @@ -250,6 +260,8 @@ def __iter__(self):
if not gotit:
#one of the repeats didn't match so we stop processing item
break
if uncrawled_targets:
self.logger.info("SKIP: %s (can't apply metadata to %s not crawled urls)" % (item['_path'], uncrawled_targets))

if not gotit:
notextracted.append(item)
Expand Down Expand Up @@ -305,7 +317,7 @@ def extract(self, pats, tree, item, stats):
for format, node in nodes:
if getattr(node, 'drop_tree', None) is None:
continue
if not node.getparent():
if node.getparent() is None:
# already dropped
toremove.append((format, node))
continue
Expand All @@ -332,6 +344,7 @@ def extract(self, pats, tree, item, stats):
extracted[field] += value
else:
extracted[field] += etree.tostring(node, method='html', encoding=unicode)
self.logger.debug("EXTRACTED: %s=%s" % (field, extracted[field]))
# What was this code for?
#for field, nodes in unique.items():
# for format, node in nodes:
Expand All @@ -351,8 +364,12 @@ def extract(self, pats, tree, item, stats):
for format, tal in xps:
if format.lower() != 'tal':
continue
value = tal(item, re=re)
try:
value = tal(item, re=re)
except Exception, e:
raise TALException("%s tal caused %s"% (field,str(e))), None, sys.exc_info()[2]
extracted[field] = extracted.get(field, '') + value
self.logger.debug("EXTRACTED: %s=%s" % (field, extracted[field]))
for field, tal in self.tal:
value = tal(item, re=re)
extracted[field] = value
Expand Down

0 comments on commit 792baf5

Please sign in to comment.