-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
complete overhaul of FilteringRules to improve their usability: imple…
- Loading branch information
majid
committed
Jul 14, 2008
1 parent
633683a
commit 55cc933
Showing
14 changed files
with
759 additions
and
368 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Original file line | Diff line number | Diff line change |
---|---|---|---|
@@ -0,0 +1,276 @@ | |||
# handle the various types of FilteringRules | |||
import time, re, textwrap | |||
import normalize, param, util | |||
|
|||
# Module-level rule cache, (re)built by load_rules().
# Global rules, evaluated for every item unless the item is exempt.
rules = list()
# Feed-specific rules: feed UID -> list of rule objects.
feed_rules = dict()
# Set once the cache has been loaded; reset by invalidate().
loaded = False
|
|||
def evaluate_rules(item, feed, feed_uid, exempt):
    """Run the applicable filtering rules against an item.

    The global rules are tried first (they are skipped entirely when
    ``exempt`` is true), followed by the rules registered for ``feed_uid``.

    Returns a ``(matched, rule)`` tuple: ``(True, rule)`` for the first rule
    whose ``test()`` is true, or ``(False, None)`` when no rule matches.
    """
    # list * bool yields either the list itself (True) or [] (False)
    for rule in rules * (not exempt) + feed_rules.get(feed_uid, list()):
        try:
            if rule.test(item, feed, feed_uid):
                return True, rule
        except Exception:
            # A failing rule is reported and skipped so the remaining rules
            # still run; KeyboardInterrupt/SystemExit are no longer swallowed.
            util.print_stack(['f'])
    return False, None
|
|||
class Rule:
    """Base class for filtering rules.

    Every instance records itself in the class-level ``registry`` under its
    UID; creating two rules with the same UID trips an assertion.
    """

    # uid -> rule instance, shared by Rule and all of its subclasses
    registry = {}

    def __init__(self, uid, expires):
        known = self.registry
        assert uid not in known
        known[uid] = self
        self.uid, self.expires = uid, expires

    def __str__(self):
        return '<Rule %s>' % self.uid

    def __repr__(self):
        # delegate so subclasses only need to override __str__
        return self.__str__()

    def check_expires(self):
        """Return a true value when the rule has an expiry time in the past.

        A rule without an expiry (falsy ``expires``) yields that falsy value.
        """
        deadline = self.expires
        if not deadline:
            return deadline
        return time.time() > deadline
|
|||
class KeywordRule(Rule):
    """Rule that matches keywords or a phrase in an item's title or content.

    ``rtype`` has the form ``<target>_<match>``: the target is ``title`` or
    ``content``; the match is ``word`` (any of the words), ``all`` (all of
    the words), ``phrase_lc`` (substring of the lower-cased text) or anything
    else (substring of the raw text).
    """

    def __init__(self, uid, expires, rule, rtype):
        Rule.__init__(self, uid, expires)
        self.target, self.match = rtype.split('_', 1)
        assert self.target in ['title', 'content']
        if self.match in ['word', 'all']:
            # word-based matches compare against a collection of words
            self.rule = normalize.get_words(rule)
        else:
            self.rule = rule

    def __str__(self):
        return '<KeywordRule %s %s %s %s>' % (self.uid, self.target,
                                              self.match, self.rule)

    def test(self, item, feed, feed_uid):
        """Return True when the (unexpired) rule matches ``item``.

        Reads ``item[target + '_words']``, ``item[target + '_lc']`` or
        ``item[target]`` depending on the match type.
        """
        if self.check_expires():
            return False
        if self.match in ['word', 'all']:
            suffix = '_words'
        elif self.match == 'phrase_lc':
            suffix = '_lc'
        else:
            suffix = ''
        target = item[self.target + suffix]
        if self.match == 'word':
            return bool(target.intersection(self.rule))
        elif self.match == 'all':
            return bool(target.issuperset(self.rule))
        else:
            return self.rule in target

    def highlight(self, html):
        """Wrap the first occurrence of each matched term in a highlight span."""
        # isinstance (rather than an exact type() comparison) also accepts
        # str/unicode subclasses, which would otherwise be iterated
        # character by character in the branch below.
        if isinstance(self.rule, basestring):
            return normalize.replace_first(
                html, self.rule,
                '<span class="filter-highlight">%s</span>' % self.rule)
        else:
            for word in self.rule:
                html = normalize.replace_first(
                    html, word,
                    '<span class="filter-highlight">%s</span>' % word)
            return html

    def highlight_title(self, html):
        """Highlight the title for title rules and for 'union' content rules."""
        # A content rule with a positive UID whose negated UID is also
        # registered is the content half of a 'union_' pair (see load_rules),
        # so the title is highlighted as well.
        if self.target == 'content' \
               and self.uid > 0 and -self.uid in self.registry:
            return self.highlight(html)
        if self.target == 'title':
            return self.highlight(html)
        return html

    def highlight_content(self, html):
        """Highlight the content only for rules targeting the content."""
        if self.target == 'content':
            return self.highlight(html)
        return html
|
|||
######################################################################## | |||
# functions used inside Python rules | |||
def link_already(url):
    """Return how many stored items have a link starting with ``url``.

    Exposed to Python rules to detect items that were already seen. The
    result is the ``count(*)`` value, or the falsy row if nothing was fetched.
    """
    from singleton import db
    print >> param.log, 'checking for deja-vu for', url,
    c = db.cursor()
    try:
        # NOTE(review): '%' and '_' inside url act as LIKE wildcards here,
        # which may over-match -- confirm whether escaping is wanted.
        c.execute("select count(*) from fm_items where item_link like ?",
                  [url + '%'])
        row = c.fetchone()
    finally:
        # release the cursor even when the query fails
        c.close()
    print >> param.log, row and row[0]
    return row and row[0]
|
|||
# shades of LISP...
def curry(fn, obj):
    """Return a callable that invokes ``fn`` with ``obj`` as first argument."""
    def curried(*args):
        return fn(obj, *args)
    return curried
|
|||
def any(obj, *words):
    """Return True as soon as one of ``words`` is contained in ``obj``.

    ``obj`` can be a string, list or dictionary (anything supporting ``in``).
    """
    matched = False
    for candidate in words:
        if candidate in obj:
            matched = True
            break
    return matched
|
|||
def union_any(obj_list, *words):
    """Return True when any of ``words`` is contained in any of ``obj_list``.

    Words are tried in order, each against every container in turn.
    """
    found = False
    for needle in words:
        for haystack in obj_list:
            if needle in haystack:
                found = True
                break
        if found:
            break
    return found
|
|||
######################################################################## | |||
|
|||
# matches a whole line that starts with '#'
rule_comment_re = re.compile('^#.*$', re.MULTILINE)


def normalize_rule(rule):
    """Strip comment lines and fold CR/LF into spaces.

    This lets rules span several lines and carry comments for readability
    while still being usable as a single expression.
    """
    without_comments = rule_comment_re.sub('', rule)
    flattened = without_comments.replace('\n', ' ')
    flattened = flattened.replace('\r', ' ')
    return flattened.strip()
|
|||
# 80-column wrapper that never splits a long word
wrapper = textwrap.TextWrapper(break_long_words=False, width=80)
# XXX this relies on textwrap implementation details to prevent wrapping on
# XXX hyphens and em-dashes, only on spaces
wrapper.wordsep_re = re.compile(r'(\s+)')


def rule_lines(rule):
    "Find how many lines are needed for the rule in a word-wrapped <textarea>"
    per_line = []
    for text in rule.splitlines():
        if not text.strip():
            # a blank line still occupies one row
            per_line.append(1)
            continue
        per_line.append(len(wrapper.wrap(text)))
    return sum(per_line)
|
|||
class PythonRule(Rule):
    """Rule written as a Python expression evaluated against each item.

    The rule text is normalized (comments stripped, line breaks folded) and
    compiled once in the constructor; a syntax error propagates from there.
    """

    def __init__(self, uid, expires, rule):
        Rule.__init__(self, uid, expires)
        self.rule = rule
        rule = normalize_rule(rule)
        # the pseudo file name identifies the rule in tracebacks
        self.code = compile(rule, 'rule' + repr(uid), 'eval')

    def __str__(self):
        return '<PythonRule %s %s>' % (self.uid, normalize_rule(self.rule))

    def test(self, item, feed, feed_uid):
        """Evaluate the compiled expression; return its truth value.

        The expression sees the feed attributes prefixed with ``feed_``, all
        keys of ``item``, ``link_already`` and the ``*_any*`` helpers.
        """
        if self.check_expires():
            return False
        filter_dict = dict()
        for key in feed.feed:
            try:
                filter_dict['feed_' + key] = feed.feed[key]
            except KeyError:
                pass
        filter_dict.update(item)
        # used to filter echos from sites like Digg
        filter_dict['link_already'] = link_already
        # convenient shortcut functions
        filter_dict['title_any_words'] = curry(any, item['title_words'])
        filter_dict['content_any_words'] = curry(any, item['content_words'])
        filter_dict['union_any_words'] = curry(
            union_any, [item['title_words'], item['content_words']])
        filter_dict['title_any'] = curry(any, item['title'])
        filter_dict['content_any'] = curry(any, item['content'])
        filter_dict['union_any'] = curry(
            union_any, [item['title'], item['content']])
        filter_dict['title_any_lc'] = curry(any, item['title_lc'])
        filter_dict['content_any_lc'] = curry(any, item['content_lc'])
        filter_dict['union_any_lc'] = curry(
            union_any, [item['title_lc'], item['content_lc']])
        # NOTE(review): eval() runs the rule text as arbitrary Python; this
        # is only acceptable while rules come from a trusted author.
        return bool(eval(self.code, filter_dict))

    def highlight_title(self, html):
        """Python rules do not highlight anything in the title."""
        return html

    def highlight_content(self, html):
        """Append a note naming the rule that filtered the item."""
        # %s rather than %d: feed-level filters use string UIDs such as
        # 'feed_12', which %d rejects with a TypeError.
        return html + '<br><p>Filtered by Python rule %s</p>' % self.uid
|
|||
def load_rules(db, c):
    """Fill the module-level rule cache from the database, once.

    Does nothing when the cache is already loaded. Otherwise reads the
    unexpired rows of ``fm_rules`` and the non-null ``feed_filter`` values of
    ``fm_feeds`` through cursor ``c``, building ``rules`` (global) and
    ``feed_rules`` (per feed UID). The cursor is always closed afterwards.
    Errors are reported with ``util.print_stack()`` and stop the loading; the
    cache is still marked as loaded with whatever was read so far.
    """
    global loaded, rules, feed_rules
    if loaded:
        return
    rules = []
    feed_rules = dict()
    try:
        try:
            c.execute("""select rule_uid, rule_type, rule_text, rule_feed_uid,
            strftime('%s', rule_expires)
            from fm_rules
            where rule_expires is null or rule_expires > julianday('now')""")
            for uid, rtype, rule, feed_uid, expires in c:
                if expires:
                    expires = int(expires)
                if feed_uid:
                    container = feed_rules.setdefault(feed_uid, list())
                else:
                    container = rules
                if rtype == 'python':
                    rule = PythonRule(uid, expires, rule)
                    container.append(rule)
                elif rtype.startswith('union_'):
                    # XXX this convention of adding a second rule object with
                    # XXX UID -uid is a ugly hack
                    container.append(KeywordRule(
                        -uid, expires, rule,
                        rtype.replace('union_', 'title_')))
                    container.append(KeywordRule(
                        uid, expires, rule,
                        rtype.replace('union_', 'content_')))
                else:
                    container.append(KeywordRule(uid, expires, rule, rtype))
            c.execute("""select feed_uid, feed_filter from fm_feeds
            where feed_filter is not null""")
            for feed_uid, rule in c:
                rule = PythonRule('feed_%d' % feed_uid, None, rule)
                feed_rules.setdefault(feed_uid, list()).append(rule)
        except Exception:
            # Report and carry on with a partial rule set; unlike a bare
            # except, this lets KeyboardInterrupt/SystemExit propagate (the
            # cursor is still closed and the cache stays marked unloaded).
            util.print_stack()
    finally:
        c.close()
    loaded = True
|
|||
def invalidate():
    """Mark the rule cache as stale so the next load re-reads the database"""
    global loaded
    loaded = False
    # drop the registered rule objects (breaks cyclic references)
    Rule.registry.clear()
|
|||
def update_rule(db, c, uid, expires, text, delete):
    """Create, delete or modify a Python rule, then commit and invalidate.

    ``uid`` is ``'new'`` to insert a rule; otherwise ``delete == 'on'``
    removes the rule and any other value updates it. ``expires`` is either
    ``'never'`` or a date string understood by SQLite's ``julianday()``.

    Raises ``SyntaxError`` (from ``compile``) when ``text`` is not a valid
    expression; nothing is written in that case.
    """
    # Bound parameters are stored as plain values, never parsed as SQL:
    # binding the strings 'NULL' or "julianday('...')" would store that
    # literal text. Bind None (SQL NULL) or the raw date string instead and
    # let the statement itself call julianday(); julianday(NULL) is NULL.
    if expires == 'never':
        expires = None
    # check syntax
    compile(normalize_rule(text), 'web form', 'eval')
    if uid == 'new':
        c.execute("insert into fm_rules (rule_expires, rule_text) "
                  "values (julianday(?), ?)",
                  [expires, text])
    elif delete == 'on':
        c.execute("delete from fm_rules where rule_uid=?", [uid])
    else:
        c.execute("""update fm_rules set rule_expires=julianday(?), rule_text=?
        where rule_uid=?""", [expires, text, uid])
    db.commit()
    invalidate()
|
|||
def add_kw_rule(db, c, kw=None, item_uid=None, match='word', target='title',
                feed_only=False, retroactive=False, **kwargs):
    """Insert keyword rule(s) for ``kw`` and invalidate the rule cache.

    With ``match='word'`` one rule is inserted per word of ``kw``; the other
    match types (``all``, ``phrase_lc``, ``phrase``) insert a single rule. An
    empty ``kw`` or an unknown ``match`` is silently ignored. When
    ``feed_only`` is true the rule is tied to the feed of item ``item_uid``.
    No commit is issued here.
    """
    feed_only = bool(feed_only)
    retroactive = bool(retroactive)

    # the feed is looked up from the item, so only keep the item UID when the
    # rule is meant to be feed-specific
    if not feed_only:
        item_uid = None
    else:
        item_uid = int(item_uid)

    if not kw:
        return
    if match == 'phrase':
        words = [kw]
    elif match == 'phrase_lc':
        words = [normalize.lower(kw)]
    elif match == 'all':
        words = [' '.join(normalize.get_words(kw))]
    elif match == 'word':
        words = normalize.get_words(kw)
    else:
        return

    rule_type = '_'.join([target, match])

    insert_sql = """insert into fm_rules (rule_type, rule_feed_uid, rule_text)
    values (?, (select item_feed_uid from fm_items where item_uid=?), ?)"""
    for word in words:
        print >> param.log, 'ADD_KW_RULES', rule_type, item_uid, word
        c.execute(insert_sql, [rule_type, item_uid, word])
    invalidate()
|
|||
def del_kw_rule(db, c, rule_uid=None, **kwargs):
    """Delete a rule, first resetting the items that were filtered by it.

    Items referencing the rule get their rating set to 0 and their rule
    reference cleared; the rule cache is invalidated afterwards. No commit
    is issued here.
    """
    params = [rule_uid]
    reset_items_sql = """update fm_items
    set item_rating=0, item_rule_uid=NULL
    where item_rule_uid=?"""
    c.execute(reset_items_sql, params)
    c.execute('delete from fm_rules where rule_uid=?', params)
    invalidate()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.