#!/usr/bin/python
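# Scrapes the front pages listed in `feeders` below, parses the linked
# articles, and stores their plain-text versions in a git repository under
# DIFF_DIR so that later edits to the same article can be diffed.
# Relies on a local `models` module (not shown here) for the Article table
# and the SQLAlchemy Session.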
import sys
import subprocess
import urllib2
import httplib
import os
import errno
import models
import re
from datetime import datetime, timedelta
import traceback
import sqlalchemy
# Different versions of BeautifulSoup have different properties.
# Some work with one site, some with another.
# This is BeautifulSoup 3.2.
from BeautifulSoup import BeautifulSoup
# This is BeautifulSoup 4
import bs4
DIFF_DIR = 'articles/'
GIT_DIR = DIFF_DIR + '.git'
DATE_FORMAT = '%B %d, %Y at %l:%M%P EDT'
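# Note: '%l' (space-padded 12-hour clock) and '%P' (lowercase am/pm) are GNU
# strftime extensions, so this format assumes a platform whose C library
# supports them (e.g. glibc); the EDT suffix is hard-coded.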
# Begin utility functions
# subprocess.check_output appeared in python 2.7.
# Linerva only has 2.6
def check_output(*popenargs, **kwargs):
    r"""Run command with arguments and return its output as a byte string.

    If the exit code was non-zero it raises a CalledProcessError. The
    CalledProcessError object will have the return code in the returncode
    attribute and output in the output attribute.

    The arguments are the same as for the Popen constructor. Example:

    >>> check_output(["ls", "-l", "/dev/null"])
    'crw-rw-rw- 1 root root 1, 3 Oct 18 2007 /dev/null\n'

    The stdout argument is not allowed as it is used internally.
    To capture standard error in the result, use stderr=STDOUT.

    >>> check_output(["/bin/sh", "-c",
    ...               "ls -l non_existent_file ; exit 0"],
    ...              stderr=STDOUT)
    'ls: non_existent_file: No such file or directory\n'
    """
    from subprocess import PIPE, CalledProcessError, Popen
    if 'stdout' in kwargs:
        raise ValueError('stdout argument not allowed, it will be overridden.')
    process = Popen(stdout=PIPE, *popenargs, **kwargs)
    output, unused_err = process.communicate()
    retcode = process.poll()
    if retcode:
        cmd = kwargs.get("args")
        if cmd is None:
            cmd = popenargs[0]
        raise CalledProcessError(retcode, cmd, output=output)
    return output
if not hasattr(subprocess, 'check_output'):
    subprocess.check_output = check_output
def mkdir_p(path):
    try:
        os.makedirs(path)
    except OSError as exc:  # Python >2.5
        if exc.errno == errno.EEXIST:
            pass
        else:
            raise
# Begin hot patch for https://bugs.launchpad.net/bugs/788986
# Ick.
def bs_fixed_getText(self, separator=u""):
    bsmod = sys.modules[BeautifulSoup.__module__]
    if not len(self.contents):
        return u""
    stopNode = self._lastRecursiveChild().next
    strings = []
    current = self.contents[0]
    while current is not stopNode:
        if isinstance(current, bsmod.NavigableString):
            strings.append(current)
        current = current.next
    return separator.join(strings)
sys.modules[BeautifulSoup.__module__].Tag.getText = bs_fixed_getText
# End fix
def canonicalize_url(url):
    return url.split('?')[0].split('#')[0].strip()

def strip_prefix(string, prefix):
    if string.startswith(prefix):
        string = string[len(prefix):]
    return string

def url_to_filename(url):
    return strip_prefix(url, 'http://').rstrip('/')

def grab_url(url):
    text = urllib2.urlopen(url).read()
    # Occasionally need to retry
    if '<title>NY Times Advertisement</title>' in text:
        return grab_url(url)
    return text

def strip_whitespace(text):
    lines = text.split('\n')
    return '\n'.join(x.strip().rstrip(u'\xa0') for x in lines).strip() + '\n'
# End utility functions
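# Each feeder is (front-page URL, article-URL filter), optionally followed by
# the BeautifulSoup class to parse that front page with; find_article_urls
# defaults to BeautifulSoup 3 when no class is given.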
feeders = [('http://www.nytimes.com/',
            lambda url: 'www.nytimes.com/201' in url),
           ('http://edition.cnn.com/',
            lambda url: 'edition.cnn.com/201' in url),
           ('http://www.politico.com/',
            lambda url: 'www.politico.com/news/stories' in url,
            bs4.BeautifulSoup),
           ]
# Article urls for a single website
def find_article_urls(feeder_url, filter_article, SoupVersion=BeautifulSoup):
    html = grab_url(feeder_url)
    soup = SoupVersion(html)
    # "or ''" to make None into str
    urls = [a.get('href') or '' for a in soup.findAll('a')]
    domain = os.path.dirname(feeder_url)
    urls = [url if '://' in url else domain + url for url in urls]
    return [url for url in urls if filter_article(url)]
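# Collect article URLs from every feeder, canonicalized (query strings and
# fragments stripped) and de-duplicated via a set.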
def get_all_article_urls():
    ans = set()
    for feeder in feeders:
        urls = find_article_urls(*feeder)
        ans = ans.union(map(canonicalize_url, urls))
    return ans
# Parser for NYT articles.
# Also used as a base class for other parsers; probably should be split.
class Article(object):
    url = None
    title = None
    date = None
    body = None
    real_article = True
    meta = []
    SUFFIX = '?pagewanted=all'

    def __init__(self, url):
        self.url = url
        self.html = grab_url(url + self.SUFFIX)
        # Debug dumps: keep a copy of the last fetched page and its URL.
        open('/tmp/moo', 'w').write(self.html)
        open('/tmp/moo3', 'w').write(url+self.SUFFIX)
        self._parse(self.html)

    def _parse(self, html):
        soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
        self.meta = soup.findAll('meta')
        self.seo_title = soup.find('meta', attrs={'name':'hdl'}).get('content')
        tmp = soup.find('meta', attrs={'name':'hdl_p'})
        if tmp and tmp.get('content'):
            self.title = tmp.get('content')
        else:
            self.title = self.seo_title
        self.date = soup.find('meta', attrs={'name':'dat'}).get('content')
        self.byline = soup.find('meta', attrs={'name':'byl'}).get('content')
        p_tags = soup.findAll('p', attrs={'itemprop':'articleBody'})
        self.body = '\n'.join([p.getText() for p in p_tags])
        authorids = soup.find('div', attrs={'class':'authorIdentification'})
        self.authorid = authorids.getText() if authorids else ''
        self.top_correction = '\n'.join(x.getText() for x in
                                        soup.findAll('nyt_correction_top')) or '\n'
        self.bottom_correction = '\n'.join(x.getText() for x in
                                           soup.findAll('nyt_correction_bottom')) or '\n'

    def __unicode__(self):
        return strip_whitespace(u'\n'.join((self.date, self.title, self.byline,
                                            self.top_correction, self.body,
                                            self.authorid,
                                            self.bottom_correction,)))
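# Usage sketch (the URL below is hypothetical):
#   article = Article('http://www.nytimes.com/2012/05/01/us/example.html')
#   print unicode(article)   # date, headline, byline, corrections and body text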
# XXX CNN might have an issue with unicode
class CNNArticle(Article):
    SUFFIX = ''

    def _parse(self, html):
        soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
        p_tags = soup.findAll('p', attrs={'class':re.compile(r'\bcnn_storypgraphtxt\b')})
        if not p_tags:
            self.real_article = False
            return

        self.meta = soup.findAll('meta')
        self.title = soup.find('meta', attrs={'itemprop':'headline'}).get('content')
        datestr = soup.find('meta', attrs={'itemprop':'dateModified'}).get('content')
        date = datetime.strptime(datestr, '%Y-%m-%dT%H:%M:%SZ') - timedelta(hours=4)
        self.date = date.strftime(DATE_FORMAT)
        self.byline = soup.find('meta', attrs={'itemprop':'author'}).get('content')
        lede = p_tags[0].previousSibling.previousSibling
        editornotes = soup.findAll('p', attrs={'class':'cnnEditorialNote'})
        contributors = soup.findAll('p', attrs={'class':'cnn_strycbftrtxt'})
        self.body = '\n' + '\n\n'.join([p.getText() for p in
                                        editornotes + [lede] + p_tags + contributors])
        authorids = soup.find('div', attrs={'class':'authorIdentification'})  # NOTE: currently unused

    def __unicode__(self):
        return strip_whitespace(u'\n'.join((self.date, self.title, self.byline,
                                            self.body,)))
class PoliticoArticle(Article):
    SUFFIX = ''

    def _parse(self, html):
        soup = bs4.BeautifulSoup(html)
        print_link = soup.findAll('a', text='Print')[0].get('href')
        html2 = grab_url(print_link)
        # Now we have to switch back to bs3. Hilarious.
        # And the labeled encoding is wrong, so force utf-8.
        soup = BeautifulSoup(html2, convertEntities=BeautifulSoup.HTML_ENTITIES,
                             fromEncoding='utf-8')
        self.meta = soup.findAll('meta')
        p_tags = soup.findAll('p')[1:]
        self.title = soup.find('strong').getText()
        entity = soup.find('span', attrs={'class':'author'})
        children = list(entity.childGenerator())
        self.byline = 'By ' + children[1].getText()
        datestr = children[-1].strip()
        self.date = datestr
        self.body = '\n' + '\n\n'.join([p.getText() for p in p_tags])

    def __unicode__(self):
        return strip_whitespace(u'\n'.join((self.date, self.title, self.byline,
                                            self.body,)))
# NYT blogs
# Currently broken
class BlogArticle(Article):
    SUFFIX = '?pagemode=print'

    def _parse(self, html):
        div_id = url_to_filename(self.url).split('.')
        soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
        self.document = soup.find('div', attrs={'id':div_id})

    def __unicode__(self):
        return strip_whitespace(self.document.getText())
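# Maps a URL's domain to the parser class used for its articles; the
# commented-out entries are the NYT blog parsers, which are currently broken.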
DomainNameToClass = {'www.nytimes.com': Article,
                     # 'opinionator.blogs.nytimes.com': BlogArticle,
                     # 'krugman.blogs.nytimes.com': BlogArticle,
                     'edition.cnn.com': CNNArticle,
                     'www.politico.com': PoliticoArticle,
                     }

def get_parser(url):
    return DomainNameToClass[url_to_filename(url).split('/')[0]]
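# add_to_git_repo writes the article text under DIFF_DIR and commits it with
# git. Returns 2 if the file is new, 1 if its contents changed, and 0 if the
# working copy is unchanged.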
def add_to_git_repo(data, filename):
    mkdir_p(os.path.dirname(DIFF_DIR+filename))
    already_exists = os.path.exists(DIFF_DIR+filename)
    open(DIFF_DIR+filename, 'w').write(data)
    if not already_exists:
        subprocess.call(['git', 'add', filename], cwd=DIFF_DIR)
        commit_message = 'Added %s' % filename
        return_value = 2
    else:
        if not subprocess.check_output(['git', 'ls-files', '-m', filename], cwd=DIFF_DIR):
            return 0
        return_value = 1
        commit_message = 'Change to %s' % filename
    subprocess.call(['git', 'commit', filename, '-m', commit_message], cwd=DIFF_DIR)
    return return_value
# Update url in git.
# Returns None for unknown domains, -1 on parse errors, 0 for pages that are
# not real articles, and otherwise add_to_git_repo's return value (2 new,
# 1 changed, 0 unchanged).
def update_article(url):
    try:
        parser = get_parser(url)
    except KeyError:
        print 'Unable to parse domain, skipping'
        return
    try:
        article = parser(url)
    except (AttributeError, urllib2.HTTPError, httplib.HTTPException), exc:
        print >> sys.stderr, 'Exception when parsing', url
        traceback.print_exc()
        print 'Continuing'
        return -1
    if not article.real_article:
        return 0
    to_store = unicode(article).encode('utf8')
    return add_to_git_repo(to_store, url_to_filename(url))
def insert_all_articles(session):
    for url in get_all_article_urls():
        if session.query(models.Article).filter_by(url=url).first() is None:
            session.add(models.Article(url))
    session.commit()
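# Re-check schedule, in minutes: every 15 minutes for the first three hours
# since the article last changed, every 30 minutes for the first day, every
# two hours for the first week, daily for the first month, weekly after that.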
def get_update_delay(minutes_since_update):
    days_since_update = minutes_since_update // (24 * 60)
    if minutes_since_update < 60*3:
        return 15
    elif days_since_update < 1:
        return 30
    elif days_since_update < 7:
        return 120
    elif days_since_update < 30:
        return 60*24
    else:
        return 60*24*7
if __name__ == '__main__':
    # Make stdout/stderr unbuffered so log lines appear immediately.
    sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)
    sys.stderr = os.fdopen(sys.stderr.fileno(), 'w', 0)

    session = models.Session()
    insert_all_articles(session)
    num_articles = session.query(models.Article).count()
    articles = session.query(models.Article).order_by(models.Article.last_check).all()
    # Check the most overdue articles first, relative to their update schedule.
    articles.sort(key=lambda x: -x.minutes_since_check() * 1. / get_update_delay(x.minutes_since_update()))
    for i, article_row in enumerate(articles):
        print 'Woo:', article_row.minutes_since_update(), article_row.minutes_since_check(), '(%s/%s)' % (i+1, num_articles)
        delay = get_update_delay(article_row.minutes_since_update())
        if article_row.minutes_since_check() < delay:
            continue
        print 'Considering', article_row.url, datetime.now()
        try:
            retcode = update_article(article_row.url)
        except Exception, e:
            if isinstance(e, urllib2.HTTPError) and e.msg == 'Gone':
                retcode = 0
            else:
                print >> sys.stderr, 'Unknown exception when updating', article_row.url
                traceback.print_exc()
                retcode = -1
        if retcode > 1:
            print 'Updated!'
            article_row.last_update = datetime.now()
        if retcode >= 0:
            article_row.last_check = datetime.now()
        try:
            session.commit()
        except sqlalchemy.exc.OperationalError, e:
            print >> sys.stderr, 'Exception when committing', article_row.url
            traceback.print_exc()
            print 'Continuing'