complete overhaul of FilteringRules to improve their usability: implemented
#48, #60, #64

majid committed Jul 14, 2008
1 parent 633683a commit 55cc933
Showing 14 changed files with 759 additions and 368 deletions.
8 changes: 7 additions & 1 deletion Makefile
@@ -21,6 +21,12 @@ sync:
 	-mv feedparser.py feedparser.old
 	wget http://diveintomark.org/projects/feed_parser/feedparser.py
 
+JUI= spool/jquery.ui-*[0-9]/ui
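+# fetch, concatenate and minify the JavaScript dependencies into rsrc/temboz.js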
+js:
+	vcheck --verbose -d --file etc/vcheck
+	(cd spool; wget -c http://jqueryjs.googlecode.com/svn/trunk/plugins/form/jquery.form.js)
+	cat spool/jquery.ui-*[0-9]/jquery-[0-9]*.js spool/jquery.form.js $(JUI)/ui.core.js $(JUI)/ui.dialog.js $(JUI)/ui.tabs.js | jsmin > rsrc/temboz.js
+	./temboz --kill
 changelog:
 	cvs2cl.pl --tags -g -q
 
@@ -57,7 +63,7 @@ distclean:
 
 clean: distclean
 	-rm -rf temboz-$(VERSION) temboz-$(VERSION).tar.gz
-	-rm -f pages/*.py pages/*.pyc pages/*.pyo pages/*.py.bak
+	-rm -f pages/*.py pages/*.pyc pages/*.pyo pages/*.py.bak *.js
 
 realclean: clean
 	-rm -rf rss.db
3 changes: 3 additions & 0 deletions UPGRADE
@@ -1,3 +1,6 @@
+To 0.9
+SQLite 3.5 is required (well, at least 3.1.3 due to "alter table add column")
+------------------------------------------------------------------------
 From 0.4.4 to 0.5:
 
 First kill your running Temboz server.
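
The schema changes in this commit (see ddl.sql below) amount to four "alter
table add column" statements on an existing database; a minimal sketch,
assuming the database file is rss.db as elsewhere in this repository:

  import sqlite3

  db = sqlite3.connect('rss.db')
  c = db.cursor()
  # columns added by this commit, per ddl.sql
  for stmt in [
    "alter table fm_feeds add column feed_exempt int default 0",
    "alter table fm_items add column item_rule_uid integer",
    "alter table fm_rules add column rule_type varchar(16)"
    " not null default 'python'",
    "alter table fm_rules add column rule_feed_uid integer",
  ]:
    c.execute(stmt)
  db.commit()
  db.close()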
12 changes: 9 additions & 3 deletions ddl.sql
@@ -16,13 +16,14 @@ create table fm_feeds (
   -- 0=hourly, 1=daily, 2=weekly, 3=monthly
   feed_frequency int default 0,
   feed_auth varchar(255),
-  feed_filter text
+  feed_filter text,
+  feed_exempt int default 0
 );
 
 create table fm_items (
   item_uid integer primary key,
   item_guid varchar(255),
-  item_feed_uid int,
+  item_feed_uid integer,
   -- references fm_feeds (feed_uid) on delete cascade,
   item_loaded timestamp,
   item_created timestamp,
@@ -34,7 +35,9 @@ create table fm_items (
   item_content text,
   item_creator varchar(255),
   item_rating default 0,
-  item_item_uid int -- to cluster related items together
+  item_item_uid int, -- to cluster related items together
+  item_rule_uid integer
+  -- references fm_rules (rule_uid) on delete cascade
 );
 
 create trigger update_timestamp after insert on fm_items
@@ -50,6 +53,9 @@ create index item_title_i on fm_items(item_feed_uid, item_title);
 
 create table fm_rules (
   rule_uid integer primary key,
+  rule_type varchar(16) not null default 'python',
+  rule_feed_uid integer,
+  -- references fm_feeds (feed_uid) on delete cascade,
   rule_expires timestamp,
   rule_text text
 );
276 changes: 276 additions & 0 deletions filters.py
@@ -0,0 +1,276 @@
# handle the various types of FilteringRules
import time, re, textwrap
import normalize, param, util

rules = []
feed_rules = {}
loaded = False

def evaluate_rules(item, feed, feed_uid, exempt):
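  # feed-specific rules always apply; site-wide rules are skipped for exempt
  # feeds, since multiplying the list by the boolean yields [] when exempt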
for rule in rules * (not exempt) + feed_rules.get(feed_uid, list()):
try:
if rule.test(item, feed, feed_uid):
return True, rule
except:
util.print_stack(['f'])
return False, None

class Rule:
registry = dict()
def __init__(self, uid, expires):
assert uid not in self.registry
self.registry[uid] = self
self.uid = uid
self.expires = expires
def __str__(self):
return '<Rule %s>' % self.uid
def __repr__(self):
return self.__str__()
def check_expires(self):
return self.expires and time.time() > self.expires

class KeywordRule(Rule):
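  # rtype combines a target with a match type, e.g. 'title_word' or
  # 'content_phrase_lc': 'word' fires if any keyword occurs, 'all' if every
  # keyword occurs, 'phrase' on an exact substring match and 'phrase_lc' on a
  # case- and accent-insensitive one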
def __init__(self, uid, expires, rule, rtype):
Rule.__init__(self, uid, expires)
self.target, self.match = rtype.split('_', 1)
assert self.target in ['title', 'content']
if self.match in ['word', 'all']:
self.rule = normalize.get_words(rule)
else:
self.rule = rule
def __str__(self):
return '<KeywordRule %s %s %s %s>' % (self.uid, self.target, self.match,
self.rule)
def test(self, item, feed, feed_uid):
if self.check_expires():
return False
if self.match in ['word', 'all']:
suffix = '_words'
elif self.match == 'phrase_lc':
suffix = '_lc'
else:
suffix = ''
target = item[self.target + suffix]
if self.match == 'word':
return bool(target.intersection(self.rule))
elif self.match == 'all':
return bool(target.issuperset(self.rule))
else:
return self.rule in target
def highlight(self, html):
if type(self.rule) in [str, unicode]:
return normalize.replace_first(
html, self.rule,
'<span class="filter-highlight">%s</span>' % self.rule)
else:
for word in self.rule:
html = normalize.replace_first(
html, word,
'<span class="filter-highlight">%s</span>' % word)
return html
def highlight_title(self, html):
if self.target == 'content' \
and self.uid > 0 and -self.uid in self.registry:
return self.highlight(html)
if self.target == 'title':
return self.highlight(html)
return html
def highlight_content(self, html):
if self.target == 'content':
return self.highlight(html)
return html

########################################################################
# functions used inside Python rules
def link_already(url):
from singleton import db
print >> param.log, 'checking for deja-vu for', url,
c = db.cursor()
c.execute("select count(*) from fm_items where item_link like ?",
[url + '%'])
l = c.fetchone()
c.close()
print >> param.log, l and l[0]
return l and l[0]

# shades of LISP...
def curry(fn, obj):
return lambda *args: fn(obj, *args)

# obj can be a string, list or dictionary
def any(obj, *words):
for w in words:
if w in obj:
return True
return False

def union_any(obj_list, *words):
for w in words:
for obj in obj_list:
if w in obj:
return True
return False

########################################################################

rule_comment_re = re.compile('^#.*$', re.MULTILINE)
def normalize_rule(rule):
"""allow embedded CR/LF and comments to make for more readable rules"""
return rule_comment_re.sub('', rule).replace(
'\n', ' ').replace('\r', ' ').strip()
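# illustration: normalize_rule("# housekeeping\ntitle_any_words('foo',\n'bar')")
# returns "title_any_words('foo', 'bar')"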

wrapper = textwrap.TextWrapper(width=80, break_long_words=False)
# XXX this relies on textwrap implementation details to prevent wrapping on
# XXX hyphens and em-dashes, only on spaces
wrapper.wordsep_re = re.compile(r'(\s+)')
def rule_lines(rule):
"Find how many lines are needed for the rule in a word-wrapped <textarea>"
lines = 0
for line in rule.splitlines():
if line.strip():
lines += len(wrapper.wrap(line))
else:
lines += 1
return lines

class PythonRule(Rule):
def __init__(self, uid, expires, rule):
Rule.__init__(self, uid, expires)
self.rule = rule
rule = normalize_rule(rule)
self.code = compile(rule, 'rule' + `uid`, 'eval')
def __str__(self):
return '<PythonRule %s %s>' % (self.uid, normalize_rule(self.rule))
def test(self, item, feed, feed_uid):
if self.check_expires():
return False
filter_dict = dict()
for key in feed.feed:
try:
filter_dict['feed_' + key] = feed.feed[key]
except KeyError:
pass
filter_dict.update(item)
# used to filter echos from sites like Digg
filter_dict['link_already'] = link_already
# convenient shortcut functions
filter_dict['title_any_words'] = curry(any, item['title_words'])
filter_dict['content_any_words'] = curry(any, item['content_words'])
filter_dict['union_any_words'] = curry(
union_any, [item['title_words'], item['content_words']])
filter_dict['title_any'] = curry(any, item['title'])
filter_dict['content_any'] = curry(any, item['content'])
filter_dict['union_any'] = curry(
union_any, [item['title'], item['content']])
filter_dict['title_any_lc'] = curry(any, item['title_lc'])
filter_dict['content_any_lc'] = curry(any, item['content_lc'])
filter_dict['union_any_lc'] = curry(
union_any, [item['title_lc'], item['content_lc']])
return bool(eval(self.code, filter_dict))
def highlight_title(self, html):
return html
def highlight_content(self, html):
return html + '<br><p>Filtered by Python rule %d</p>' % self.uid

def load_rules(db, c):
global loaded, rules, feed_rules
if loaded: return
rules = []
feed_rules = dict()
try:
try:
c.execute("""select rule_uid, rule_type, rule_text, rule_feed_uid,
strftime('%s', rule_expires)
from fm_rules
where rule_expires is null or rule_expires > julianday('now')""")
for uid, rtype, rule, feed_uid, expires in c:
if expires: expires = int(expires)
if feed_uid:
container = feed_rules.setdefault(feed_uid, list())
else:
container = rules
if rtype == 'python':
rule = PythonRule(uid, expires, rule)
container.append(rule)
elif rtype.startswith('union_'):
# XXX this convention of adding a second rule object with UID -uid
# XXX is an ugly hack
container.append(KeywordRule(
-uid, expires, rule, rtype.replace('union_', 'title_')))
container.append(KeywordRule(
uid, expires, rule, rtype.replace('union_', 'content_')))
else:
container.append(KeywordRule(uid, expires, rule, rtype))
c.execute("""select feed_uid, feed_filter from fm_feeds
where feed_filter is not null""")
for feed_uid, rule in c:
rule = PythonRule('feed_%d' % feed_uid, None, rule)
feed_rules.setdefault(feed_uid, list()).append(rule)
except:
util.print_stack()
finally:
c.close()
loaded = True

def invalidate():
"""Invalidate the rule cache to force reloading from the database"""
# break cyclic references
Rule.registry.clear()
global loaded
loaded = False

def update_rule(db, c, uid, expires, text, delete):
  if expires == 'never':
    expires = None
  # check syntax before saving the rule
  compile(normalize_rule(text), 'web form', 'eval')
  # julianday() converts the date in SQL and maps None to NULL for 'never';
  # binding a SQL fragment like "julianday('...')" as a parameter would store
  # the string verbatim instead of evaluating it
  if uid == 'new':
    c.execute("""insert into fm_rules (rule_expires, rule_text)
    values (julianday(?), ?)""", [expires, text])
  elif delete == 'on':
    c.execute("delete from fm_rules where rule_uid=?", [uid])
  else:
    c.execute("""update fm_rules set rule_expires=julianday(?), rule_text=?
    where rule_uid=?""", [expires, text, uid])
  db.commit()
  invalidate()

def add_kw_rule(db, c, kw=None, item_uid=None, match='word', target='title',
feed_only=False, retroactive=False, **kwargs):
feed_only = bool(feed_only)
retroactive = bool(retroactive)

if feed_only:
item_uid = int(item_uid)
else:
item_uid = None

if not kw: return
if match == 'word':
words = normalize.get_words(kw)
elif match == 'all':
words = [' '.join(normalize.get_words(kw))]
elif match == 'phrase_lc':
words = [normalize.lower(kw)]
elif match == 'phrase':
words = [kw]
else:
return

rule_type = target + '_' + match

for word in words:
print >> param.log, 'ADD_KW_RULES', rule_type, item_uid, word
c.execute("""insert into fm_rules (rule_type, rule_feed_uid, rule_text)
values (?, (select item_feed_uid from fm_items where item_uid=?), ?)""",
    [rule_type, item_uid, word])
invalidate()

def del_kw_rule(db, c, rule_uid=None, **kwargs):
c.execute("""update fm_items
set item_rating=0, item_rule_uid=NULL
where item_rule_uid=?""", [rule_uid])
c.execute('delete from fm_rules where rule_uid=?', [rule_uid])
invalidate()
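
For reference, a sketch of what a stored rule_text might look like for a
Python rule, using the shortcut bindings PythonRule.test installs above (the
keywords are made up for illustration):

  # mark an item uninteresting when either its title or its content
  # mentions press-release boilerplate; normalize_rule() strips this
  # comment and folds the newlines before the rule is compiled
  union_any_words('advertorial', 'sponsored')
  or 'press release' in content_lc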
24 changes: 24 additions & 0 deletions normalize.py
@@ -212,9 +212,33 @@ def decode_entities(s):
 
 # XXX need to normalize for HTML entities as well
 def lower(s):
+  """Turn a string lower-case, including stripping accents"""
   s = unicode(s)
   return strip_diacritics(decode_entities(s)).translate(lc_map).lower()
 
+# XXX this implementation is hopefully correct, but inefficient
+# XXX we should be able to replace it with a finite state automaton in C
+# XXX for better performance
+# tested with u=u'\xe9sop\xe9sopfoo\xe9sop' and unicodedata.normalize('NFD', u)
+def replace_first(s, pat, repl):
+  """Case-insensitive replacement of the first occurrence of pat in s by repl"""
+  lc = lower(s)
+  pat = lower(pat)
+  start = lc.find(pat)
+  if start == -1:
+    return s
+  else:
+    # find the beginning of the pattern in the original string
+    # since we strip accents, the equivalent in the original string may be
+    # further than in the lower-case version
+    # i.e. we are assuming that len(lower(s)) <= len(s) for all Unicode s
+    while not lower(s[start:]).startswith(pat):
+      start += 1
+    end = start + len(pat)
+    while lower(s[start:end]) != pat:
+      end += 1
+    return s[:start] + repl + s[end:]
+
 strip_tags_re = re.compile('<[^>]*>')
 def get_words(s):
   return set([
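
The replace_first function above matches case- and accent-insensitively but
splices the replacement into the original string; a quick illustration (the
strings are made up, and assume strip_diacritics folds the accented
characters):

  from normalize import replace_first

  # the match is located in the lowered, accent-stripped text, then the
  # slice boundaries are re-derived against the original string
  print replace_first(u'R\xe9sum\xe9 writing tips', u'resume', u'[resume]')
  # -> u'[resume] writing tips'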