Permalink
Browse files

implemented #67 - article URLs in feeds are dereferenced to get rid of

Feedburner et al.
  • Loading branch information...
majid
majid committed Jun 6, 2008
1 parent 7ba0ef7 commit 2a553e8bc5057e3b51330debf276a6b5dfe74e72
Showing with 105 additions and 7 deletions.
  1. +48 −0 etc/norm_url.py
  2. +47 −3 normalize.py
  3. +10 −4 update.py
View
@@ -0,0 +1,48 @@
# One-off maintenance script: dereferences the item_link of already stored
# items (see the select statement further down) and rewrites the ones whose
# URL turns out to be a redirect.
import sys, os, threading, Queue
# The relative '.' entry is resolved against the current directory at import
# time, i.e. after the chdir below, so the imports that follow are looked up
# one level above the launch directory.
# NOTE(review): presumably the script is meant to be started from etc/ so that
# normalize.py and singleton.py in the parent directory are found -- confirm.
sys.path.append('.')
os.chdir('..')
import normalize
from singleton import db
# number of dereferencing threads to start; the main loop also counts this
# value down as each worker reports that it has finished
num_workers = 64
# jobs for the workers: (item_uid, item_link) pairs, (None, None) means stop
in_q = Queue.Queue()
# results from the workers: (uid, old_url, new_url), all None means "done"
out_q = Queue.Queue()
class Worker(threading.Thread):
    """Thread that dereferences item URLs taken from in_q.

    Each job is a (uid, url) pair. A job whose uid is None is the stop
    marker: the worker acknowledges it by putting (None, None, None) on
    out_q and then terminates. For every other job the URL is passed to
    normalize.dereference(), and (uid, url, new_url) is put on out_q only
    when the dereferenced URL differs from the stored one.
    """

    def run(self):
        while True:
            item_uid, original = in_q.get()
            if item_uid is None:
                # stop marker: tell the consumer this worker is finished
                out_q.put((None, None, None))
                break
            resolved = normalize.dereference(original)
            if original != resolved:
                out_q.put((item_uid, original, resolved))
workers = []
for i in range(num_workers):
workers.append(Worker())
workers[-1].setDaemon(True)
workers[-1].start()
c = db.cursor()
c.execute("""select item_uid, item_link
from fm_items
where item_rating>0
order by item_uid""")
map(in_q.put, c)
map(in_q.put, [(None, None)] * num_workers)
while True:
uid, url, new_url = out_q.get()
if uid is None and url is None and new_url is None:
num_workers -= 1
if num_workers == 0:
db.commit()
sys.exit(0)
continue
print uid, url
print '\t==>', new_url
c.execute('update fm_items set item_link=? where item_uid=?',
[new_url, uid])
View
@@ -1,7 +1,7 @@
# -*- coding: iso-8859-1 -*-
import sys, time, re, codecs, string, traceback, md5, HTMLParser
import unicodedata, htmlentitydefs
import feedparser, transform, util, param
import sys, time, re, codecs, string, traceback, md5, socket, HTMLParser
import unicodedata, htmlentitydefs, urllib2, urlparse
import feedparser, param, transform, util
# XXX TODO
#
@@ -259,6 +259,50 @@ def fix_date(date_tuple):
else:
return date_tuple
# code to dereference URLs and follow redirects
class Redirect(Exception):
    """Signals that a request was answered with an HTTP redirect.

    code -- the HTTP status code of the redirect response
    url  -- the URL the response redirects to

    Both values are available as attributes of the same name and, through
    the base class, as ``args`` so that str()/repr() and tracebacks show
    them.
    """

    def __init__(self, code, url):
        # initialise the base class so args/str()/repr() are not empty
        Exception.__init__(self, code, url)
        self.code = code
        self.url = url
class DontHandleRedirect(urllib2.HTTPRedirectHandler):
    """Redirect handler that reports redirects instead of following them.

    When the response carries a 'location' header (or, failing that, a 'uri'
    header), a Redirect exception is raised with the status code and the
    target URL made absolute against the URL of the request. A response with
    neither header is left unhandled (the method returns None).
    """

    def http_error_302(self, req, fp, code, msg, headers):
        # 'location' takes precedence over 'uri'
        for header_name in ('location', 'uri'):
            if header_name in headers:
                target = headers.getheaders(header_name)[0]
                break
        else:
            # no redirect target given, nothing to report
            return
        # the target may be relative; resolve it against the requested URL
        absolute = urlparse.urljoin(req.get_full_url(), target)
        raise Redirect(code, absolute)

    # all redirect status codes are treated alike
    http_error_301 = http_error_303 = http_error_307 = http_error_302
# opener whose redirect handler raises Redirect rather than following the
# redirect itself, so that dereference() can inspect every hop
redirect_opener = urllib2.build_opener(DontHandleRedirect)
# NOTE: this sets the default timeout (in seconds) for all sockets created
# from now on in the whole process, not only for the opener above
socket.setdefaulttimeout(10)
def dereference(url, seen=None):
    """Recursively dereference a URL, following HTTP redirects.

    url  -- the URL to resolve
    seen -- set of URLs already visited in this redirect chain, used to
            detect redirection loops; it is updated in place and callers
            normally leave it as None

    Returns the last URL reached that did not redirect any further. The URL
    currently being fetched is returned as is when the fetch fails
    (urllib2.URLError, ValueError or any other Exception, the latter being
    logged through util.print_stack()) or when its redirect target has
    already been visited.
    """
    # this set is used to detect redirection loops
    if seen is None:
        seen = set()
    seen.add(url)
    try:
        response = redirect_opener.open(url)
    except (urllib2.URLError, ValueError):
        return url
    except Redirect:
        # the exception is fetched through sys.exc_info() so that this clause
        # parses under both the old and the new "except" binding syntax
        redirect = sys.exc_info()[1]
        # break a redirection loop if it occurs
        if redirect.url in seen:
            return url
        # there might be several levels of redirection
        return dereference(redirect.url, seen)
    except Exception:
        # unexpected failure: log it and keep the URL we have; unlike a bare
        # except this lets KeyboardInterrupt and SystemExit propagate
        # (Python 2.5 and later)
        util.print_stack()
        return url
    # no redirect occurred; only the status mattered, so release the
    # connection right away instead of leaking it
    response.close()
    return url
# Balance HTML opening and closing tags
class Balancer(HTMLParser.HTMLParser):
"""Detect unbalanced HTML tags"""
tags = set(['b', 'strong', 'strike', 'em', 'i', 'font', 'a', 'p',
View
@@ -269,6 +269,7 @@ def hard_purge(feed_uid):
c = db.cursor()
try:
c.execute("delete from fm_items where item_feed_uid=?", [feed_uid])
c.execute("delete from fm_rules where rule_feed_uid=?", [feed_uid])
c.execute("delete from fm_feeds where feed_uid=?", [feed_uid])
db.commit()
finally:
@@ -491,6 +492,9 @@ def process_parsed_feed(f, c, feed_uid, feed_dupcheck=None):
# XXX update item here
# GUID doesn't exist yet, insert it
if not l:
# finally, dereference the URL to get rid of annoying tracking servers
# like feedburner, but only do this once to avoid wasting bandwidth
link = normalize.dereference(link)
try:
c.execute("""insert into fm_items (item_feed_uid, item_guid,
item_created, item_modified, item_viewed, item_link, item_md5hex,
@@ -651,6 +655,7 @@ def rule_lines(rule):
lines += 1
return lines
# XXX it is wasteful to keep reloading and recompiling the rules
def load_rules():
global rules
global feed_rules
@@ -659,11 +664,12 @@ def load_rules():
from singleton import db
c = db.cursor()
try:
c.execute("""select rule_uid, rule_text from fm_rules
c.execute("""select rule_uid, rule_type, rule_text from fm_rules
where rule_expires is null or rule_expires > julianday('now')""")
for uid, rule in c:
rule = normalize_rule(rule)
rules.append(compile(rule, 'rule' + `uid`, 'eval'))
for uid, rtype, rule in c:
if rtype == 'python':
rule = normalize_rule(rule)
rules.append(compile(rule, 'rule' + `uid`, 'eval'))
c.execute("""select feed_uid, feed_filter from fm_feeds
where feed_filter is not null""")
for uid, rule in c:

0 comments on commit 2a553e8

Please sign in to comment.