complete overhaul of FilteringRules to improve their usability: implemented
#48, #60, #64

majid committed Jul 14, 2008
1 parent 633683a commit 55cc933
Showing 14 changed files with 759 additions and 368 deletions.
8 changes: 7 additions & 1 deletion Makefile
@@ -21,6 +21,12 @@ sync:
 	-mv feedparser.py feedparser.old
 	wget http://diveintomark.org/projects/feed_parser/feedparser.py
 
+JUI= spool/jquery.ui-*[0-9]/ui
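+# fetch, concatenate and minify the JavaScript dependencies into rsrc/temboz.js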
+js:
+	vcheck --verbose -d --file etc/vcheck
+	(cd spool; wget -c http://jqueryjs.googlecode.com/svn/trunk/plugins/form/jquery.form.js)
+	cat spool/jquery.ui-*[0-9]/jquery-[0-9]*.js spool/jquery.form.js $(JUI)/ui.core.js $(JUI)/ui.dialog.js $(JUI)/ui.tabs.js | jsmin > rsrc/temboz.js
+	./temboz --kill
 changelog:
 	cvs2cl.pl --tags -g -q
 
@@ -57,7 +63,7 @@ distclean:
 
 clean: distclean
 	-rm -rf temboz-$(VERSION) temboz-$(VERSION).tar.gz
-	-rm -f pages/*.py pages/*.pyc pages/*.pyo pages/*.py.bak
+	-rm -f pages/*.py pages/*.pyc pages/*.pyo pages/*.py.bak *.js
 
 realclean: clean
 	-rm -rf rss.db
3 changes: 3 additions & 0 deletions UPGRADE
@@ -1,3 +1,6 @@
+To 0.9
+SQLite 3.5 is required (well, at least 3.1.3 due to "alter table add column")
+------------------------------------------------------------------------
 From 0.4.4 to 0.5:
 
 First kill your running Temboz server.
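
The schema changes in this commit (see ddl.sql below) amount to four "alter
table add column" statements on an existing database; a minimal sketch,
assuming the database file is rss.db as elsewhere in this repository:

  import sqlite3

  db = sqlite3.connect('rss.db')
  c = db.cursor()
  # columns added by this commit, per ddl.sql
  for stmt in [
    "alter table fm_feeds add column feed_exempt int default 0",
    "alter table fm_items add column item_rule_uid integer",
    "alter table fm_rules add column rule_type varchar(16)"
    " not null default 'python'",
    "alter table fm_rules add column rule_feed_uid integer",
  ]:
    c.execute(stmt)
  db.commit()
  db.close()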
12 changes: 9 additions & 3 deletions ddl.sql
@@ -16,13 +16,14 @@ create table fm_feeds (
   -- 0=hourly, 1=daily, 2=weekly, 3=monthly
   feed_frequency int default 0,
   feed_auth varchar(255),
-  feed_filter text
+  feed_filter text,
+  feed_exempt int default 0
 );
 
 create table fm_items (
   item_uid integer primary key,
   item_guid varchar(255),
-  item_feed_uid int,
+  item_feed_uid integer,
   -- references fm_feeds (feed_uid) on delete cascade,
   item_loaded timestamp,
   item_created timestamp,
@@ -34,7 +35,9 @@ create table fm_items (
   item_content text,
   item_creator varchar(255),
   item_rating default 0,
-  item_item_uid int -- to cluster related items together
+  item_item_uid int, -- to cluster related items together
+  item_rule_uid integer
+  -- references fm_rules (rule_uid) on delete cascade
 );
 
 create trigger update_timestamp after insert on fm_items
@@ -50,6 +53,9 @@ create index item_title_i on fm_items(item_feed_uid, item_title);
 
 create table fm_rules (
   rule_uid integer primary key,
+  rule_type varchar(16) not null default 'python',
+  rule_feed_uid integer,
+  -- references fm_feeds (feed_uid) on delete cascade,
   rule_expires timestamp,
   rule_text text
 );
276 changes: 276 additions & 0 deletions filters.py
@@ -0,0 +1,276 @@
# handle the various types of FilteringRules
import time, re, textwrap
import normalize, param, util

rules = []
feed_rules = {}
loaded = False

def evaluate_rules(item, feed, feed_uid, exempt):
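  # feed-specific rules always apply; site-wide rules are skipped for exempt
  # feeds, since multiplying the list by the boolean yields [] when exempt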
for rule in rules * (not exempt) + feed_rules.get(feed_uid, list()):
try:
if rule.test(item, feed, feed_uid):
return True, rule
except:
util.print_stack(['f'])
return False, None

class Rule:
registry = dict()
def __init__(self, uid, expires):
assert uid not in self.registry
self.registry[uid] = self
self.uid = uid
self.expires = expires
def __str__(self):
return '<Rule %s>' % self.uid
def __repr__(self):
return self.__str__()
def check_expires(self):
return self.expires and time.time() > self.expires

class KeywordRule(Rule):
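  # rtype combines a target with a match type, e.g. 'title_word' or
  # 'content_phrase_lc': 'word' fires if any keyword occurs, 'all' if every
  # keyword occurs, 'phrase' on an exact substring match and 'phrase_lc' on a
  # case- and accent-insensitive one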
def __init__(self, uid, expires, rule, rtype):
Rule.__init__(self, uid, expires)
self.target, self.match = rtype.split('_', 1)
assert self.target in ['title', 'content']
if self.match in ['word', 'all']:
self.rule = normalize.get_words(rule)
else:
self.rule = rule
def __str__(self):
return '<KeywordRule %s %s %s %s>' % (self.uid, self.target, self.match,
self.rule)
def test(self, item, feed, feed_uid):
if self.check_expires():
return False
if self.match in ['word', 'all']:
suffix = '_words'
elif self.match == 'phrase_lc':
suffix = '_lc'
else:
suffix = ''
target = item[self.target + suffix]
if self.match == 'word':
return bool(target.intersection(self.rule))
elif self.match == 'all':
return bool(target.issuperset(self.rule))
else:
return self.rule in target
def highlight(self, html):
if type(self.rule) in [str, unicode]:
return normalize.replace_first(
html, self.rule,
'<span class="filter-highlight">%s</span>' % self.rule)
else:
for word in self.rule:
html = normalize.replace_first(
html, word,
'<span class="filter-highlight">%s</span>' % word)
return html
def highlight_title(self, html):
if self.target == 'content' \
and self.uid > 0 and -self.uid in self.registry:
return self.highlight(html)
if self.target == 'title':
return self.highlight(html)
return html
def highlight_content(self, html):
if self.target == 'content':
return self.highlight(html)
return html

########################################################################
# functions used inside Python rules
def link_already(url):
from singleton import db
print >> param.log, 'checking for deja-vu for', url,
c = db.cursor()
c.execute("select count(*) from fm_items where item_link like ?",
[url + '%'])
l = c.fetchone()
c.close()
print >> param.log, l and l[0]
return l and l[0]

# shades of LISP...
def curry(fn, obj):
return lambda *args: fn(obj, *args)

# obj can be a string, list or dictionary
def any(obj, *words):
for w in words:
if w in obj:
return True
return False

def union_any(obj_list, *words):
for w in words:
for obj in obj_list:
if w in obj:
return True
return False

########################################################################

rule_comment_re = re.compile('^#.*$', re.MULTILINE)
def normalize_rule(rule):
"""allow embedded CR/LF and comments to make for more readable rules"""
return rule_comment_re.sub('', rule).replace(
'\n', ' ').replace('\r', ' ').strip()
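# illustration: normalize_rule("# housekeeping\ntitle_any_words('foo',\n'bar')")
# returns "title_any_words('foo', 'bar')"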

wrapper = textwrap.TextWrapper(width=80, break_long_words=False)
# XXX this relies on textwrap implementation details to prevent wrapping on
# XXX hyphens and em-dashes, only on spaces
wrapper.wordsep_re = re.compile(r'(\s+)')
def rule_lines(rule):
"Find how many lines are needed for the rule in a word-wrapped <textarea>"
lines = 0
for line in rule.splitlines():
if line.strip():
lines += len(wrapper.wrap(line))
else:
lines += 1
return lines

class PythonRule(Rule):
def __init__(self, uid, expires, rule):
Rule.__init__(self, uid, expires)
self.rule = rule
rule = normalize_rule(rule)
self.code = compile(rule, 'rule' + `uid`, 'eval')
def __str__(self):
return '<PythonRule %s %s>' % (self.uid, normalize_rule(self.rule))
def test(self, item, feed, feed_uid):
if self.check_expires():
return False
filter_dict = dict()
for key in feed.feed:
try:
filter_dict['feed_' + key] = feed.feed[key]
except KeyError:
pass
filter_dict.update(item)
# used to filter echos from sites like Digg
filter_dict['link_already'] = link_already
# convenient shortcut functions
filter_dict['title_any_words'] = curry(any, item['title_words'])
filter_dict['content_any_words'] = curry(any, item['content_words'])
filter_dict['union_any_words'] = curry(
union_any, [item['title_words'], item['content_words']])
filter_dict['title_any'] = curry(any, item['title'])
filter_dict['content_any'] = curry(any, item['content'])
filter_dict['union_any'] = curry(
union_any, [item['title'], item['content']])
filter_dict['title_any_lc'] = curry(any, item['title_lc'])
filter_dict['content_any_lc'] = curry(any, item['content_lc'])
filter_dict['union_any_lc'] = curry(
union_any, [item['title_lc'], item['content_lc']])
return bool(eval(self.code, filter_dict))
def highlight_title(self, html):
return html
def highlight_content(self, html):
return html + '<br><p>Filtered by Python rule %d</p>' % self.uid

def load_rules(db, c):
global loaded, rules, feed_rules
if loaded: return
rules = []
feed_rules = dict()
try:
try:
c.execute("""select rule_uid, rule_type, rule_text, rule_feed_uid,
strftime('%s', rule_expires)
from fm_rules
where rule_expires is null or rule_expires > julianday('now')""")
for uid, rtype, rule, feed_uid, expires in c:
if expires: expires = int(expires)
if feed_uid:
container = feed_rules.setdefault(feed_uid, list())
else:
container = rules
if rtype == 'python':
rule = PythonRule(uid, expires, rule)
container.append(rule)
elif rtype.startswith('union_'):
# XXX this convention of adding a second rule object with UID -uid
# XXX is an ugly hack
container.append(KeywordRule(
-uid, expires, rule, rtype.replace('union_', 'title_')))
container.append(KeywordRule(
uid, expires, rule, rtype.replace('union_', 'content_')))
else:
container.append(KeywordRule(uid, expires, rule, rtype))
c.execute("""select feed_uid, feed_filter from fm_feeds
where feed_filter is not null""")
for feed_uid, rule in c:
rule = PythonRule('feed_%d' % feed_uid, None, rule)
feed_rules.setdefault(feed_uid, list()).append(rule)
except:
util.print_stack()
finally:
c.close()
loaded = True

def invalidate():
"""Invalidate the rule cache to force reloading from the database"""
# break cyclic references
Rule.registry.clear()
global loaded
loaded = False

def update_rule(db, c, uid, expires, text, delete):
  if expires == 'never':
    expires = None
  # check syntax before saving the rule
  compile(normalize_rule(text), 'web form', 'eval')
  # julianday() converts the date in SQL and maps None to NULL for 'never';
  # binding a SQL fragment like "julianday('...')" as a parameter would store
  # the string verbatim instead of evaluating it
  if uid == 'new':
    c.execute("""insert into fm_rules (rule_expires, rule_text)
    values (julianday(?), ?)""", [expires, text])
  elif delete == 'on':
    c.execute("delete from fm_rules where rule_uid=?", [uid])
  else:
    c.execute("""update fm_rules set rule_expires=julianday(?), rule_text=?
    where rule_uid=?""", [expires, text, uid])
  db.commit()
  invalidate()

def add_kw_rule(db, c, kw=None, item_uid=None, match='word', target='title',
feed_only=False, retroactive=False, **kwargs):
feed_only = bool(feed_only)
retroactive = bool(retroactive)

if feed_only:
item_uid = int(item_uid)
else:
item_uid = None

if not kw: return
if match == 'word':
words = normalize.get_words(kw)
elif match == 'all':
words = [' '.join(normalize.get_words(kw))]
elif match == 'phrase_lc':
words = [normalize.lower(kw)]
elif match == 'phrase':
words = [kw]
else:
return

rule_type = target + '_' + match

for word in words:
print >> param.log, 'ADD_KW_RULES', rule_type, item_uid, word
c.execute("""insert into fm_rules (rule_type, rule_feed_uid, rule_text)
values (?, (select item_feed_uid from fm_items where item_uid=?), ?)""",
    [rule_type, item_uid, word])
invalidate()

def del_kw_rule(db, c, rule_uid=None, **kwargs):
c.execute("""update fm_items
set item_rating=0, item_rule_uid=NULL
where item_rule_uid=?""", [rule_uid])
c.execute('delete from fm_rules where rule_uid=?', [rule_uid])
invalidate()
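
For reference, a sketch of what a stored rule_text might look like for a
Python rule, using the shortcut bindings PythonRule.test installs above (the
keywords are made up for illustration):

  # mark an item uninteresting when either its title or its content
  # mentions press-release boilerplate; normalize_rule() strips this
  # comment and folds the newlines before the rule is compiled
  union_any_words('advertorial', 'sponsored')
  or 'press release' in content_lc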
24 changes: 24 additions & 0 deletions normalize.py
@@ -212,9 +212,33 @@ def decode_entities(s):
 
 # XXX need to normalize for HTML entities as well
 def lower(s):
+  """Turn a string lower-case, including stripping accents"""
   s = unicode(s)
   return strip_diacritics(decode_entities(s)).translate(lc_map).lower()
 
+# XXX this implementation is hopefully correct, but inefficient
+# XXX we should be able to replace it with a finite state automaton in C
+# XXX for better performance
+# tested with u=u'\xe9sop\xe9sopfoo\xe9sop' and unicodedata.normalize('NFD', u)
+def replace_first(s, pat, repl):
+  """Case-insensitive replacement of the first occurrence of pat in s by repl"""
+  lc = lower(s)
+  pat = lower(pat)
+  start = lc.find(pat)
+  if start == -1:
+    return s
+  else:
+    # find the beginning of the pattern in the original string
+    # since we strip accents, the equivalent in the original string may be
+    # further than in the lower-case version
+    # i.e. we are assuming that len(lower(s)) <= len(s) for all Unicode s
+    while not lower(s[start:]).startswith(pat):
+      start += 1
+    end = start + len(pat)
+    while lower(s[start:end]) != pat:
+      end += 1
+    return s[:start] + repl + s[end:]
+
 strip_tags_re = re.compile('<[^>]*>')
 def get_words(s):
   return set([
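
The replace_first function above matches case- and accent-insensitively but
splices the replacement into the original string; a quick illustration (the
strings are made up, and assume strip_diacritics folds the accented
characters):

  from normalize import replace_first

  # the match is located in the lowered, accent-stripped text, then the
  # slice boundaries are re-derived against the original string
  print replace_first(u'R\xe9sum\xe9 writing tips', u'resume', u'[resume]')
  # -> u'[resume] writing tips'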