Skip to content

Commit

Permalink
Refactor calvin.py
Browse files Browse the repository at this point in the history
  • Loading branch information
cutthroat committed Feb 17, 2012
1 parent a9f895a commit cb32ba2
Showing 1 changed file with 88 additions and 85 deletions.
173 changes: 88 additions & 85 deletions calvin/calvin.py
Expand Up @@ -8,109 +8,112 @@
import mimetypes


# Base URL of the comic; a date path in YYYY/MM/DD form is appended to it.
CALVIN_URL = r'http://www.gocomics.com/calvinandhobbes/'

# Captures the strip image URL, i.e. everything before the "?width" query.
# The dots of the host name are escaped so they match literal dots only.
IMAGE_REGEX = re.compile(r'(http://cdn\.svcs\.c2\.uclick\.com/c2/\w{32})\?width')

# Captures the human-readable strip date, e.g. "February 17, 2012".
DATE_REGEX = re.compile(r'Calvin and Hobbes Comic Strip, (\w+ \d{1,2}, \d{4})')

# Date format used for file names and log entries.
DATE_FORMAT = '%Y-%m-%d'

# The same settings grouped in one mapping (used by find()).
CALVIN = {
    'url': CALVIN_URL,
    'image': IMAGE_REGEX,
    'date': DATE_REGEX,
}

# Headers sent with every HTTP request.
REQUEST_HEADERS = {
    'User-Agent': r'Mozilla/5.0 (X11; Linux i686; rv:9.0.1) Gecko/20100101 Firefox/9.0.1',
}
# read() looks the headers up under this misspelled name; keep it working.
REQUEST_HEADRES = REQUEST_HEADERS
# read_url() uses this shorter name.
HEADERS = REQUEST_HEADERS


def parse_date(text, default=None):
    """Parse *text* with DATE_FORMAT; return *default* when *text* is None."""
    if text is None:
        return default
    parsed = datetime.datetime.strptime(text, DATE_FORMAT)
    return parsed.date()
class CalvinError(Exception):
    """Raised by find() when the page lacks the expected date or image."""

def input_interval():
    """Return the (start, end) dates taken from the first two CLI arguments.

    Each missing argument falls back to today's date.
    """
    # Padding with two Nones lets the unpacking succeed with 0-2 arguments.
    _script, first, second, *_extra = sys.argv + [None, None]
    fallback = datetime.date.today()
    start = parse_date(first, fallback)
    end = parse_date(second, fallback)
    return start, end
class SaveError(Exception):
    """Raised by save() when the target file already exists."""

def date_range(start, end):
    """Yield every date from *start* through *end*, both ends inclusive."""
    ordinal, last = start.toordinal(), end.toordinal()
    while ordinal <= last:
        yield datetime.date.fromordinal(ordinal)
        ordinal += 1
class NotAvailable(Exception):
    """Raised when the log or the fetched page shows no strip for the requested date."""

def read_url(url):
    """Fetch *url* and return ``(body_bytes, content_type_header)``."""
    request = urllib.request.Request(url=url, headers=HEADERS)
    response = urllib.request.urlopen(request)
    with response:
        body = response.read()
        content_type = response.getheader('Content-Type')
    return body, content_type
class Skip(Exception):
    """Raised by proceed() when the image recorded in the log already exists on disk."""

def read_page(date):
    """Download the comic page for *date* and return it as decoded text."""
    page_url = CALVIN_URL + date.strftime('%Y/%m/%d')
    raw, _content_type = read_url(page_url)
    text = raw.decode()
    return text

def comic_date(page):
    """Return the strip date printed in *page*, or None when it is absent."""
    found = DATE_REGEX.search(page)
    if not found:
        return None
    # The page spells the date out, e.g. "February 17, 2012".
    parsed = datetime.datetime.strptime(found.group(1), '%B %d, %Y')
    return parsed.date()
def date(text):
    """Parse a ``YYYY-MM-DD`` string into a ``datetime.date``.

    Raises ValueError (from ``strptime``) when *text* does not match.
    """
    # Return a plain date, not a datetime: download() builds its lookup keys
    # with datetime.date.fromordinal(), and a datetime never compares equal
    # to a date, so datetime keys in the replayed log would never be found.
    return datetime.datetime.strptime(text, '%Y-%m-%d').date()

def image_url(page):
    """Return the strip image URL found in *page*, or None when it is absent."""
    found = IMAGE_REGEX.search(page)
    if found:
        return found.group(1)
    return None
def replay():
    """Rebuild the download state from the ``log`` file.

    Each non-blank log line holds three whitespace-separated fields: date,
    image URL and file extension. The state maps the parsed date to the saved
    file name (date text + extension), or to None when the logged extension
    is the literal text ``None``.

    Returns a ``(log_file, log_state)`` pair, where ``log_file`` is the log
    reopened in append mode; the caller owns it and should close it.
    """
    log_state = {}
    # A missing log (first run) simply means there is nothing to replay.
    if os.path.exists('log'):
        with open('log', 'r') as log:
            for line in log:
                if not line.strip():
                    # Tolerate blank lines instead of failing to unpack.
                    continue
                date_text, url, ext = line.split()
                log_state[date(date_text)] = date_text + ext if ext != 'None' else None
    return open('log', 'a'), log_state

def save_image(url, date):
content, content_type = read_url(url)
ext = (['.unknown'] + mimetypes.guess_all_extensions(content_type))[-1]
fname = date.strftime(DATE_FORMAT) + ext
if os.path.exists(fname):
return
with open(fname, 'wb') as file:
def read(url):
    """Fetch *url* and return ``(body_bytes, content_type, ext)``.

    ``content_type`` is the raw Content-Type header (None when absent) and
    ``ext`` is a file extension guessed from it, or None when nothing can be
    guessed.
    """
    # No ``global`` statement is needed just to read a module-level name.
    request = urllib.request.Request(url=url, headers=REQUEST_HEADRES)
    with urllib.request.urlopen(request) as response:
        content_type = response.getheader('Content-Type')
        ext = None
        if content_type:
            # Drop parameters such as "; charset=..." before guessing, and
            # skip guessing entirely when the header is missing.
            mime_type = content_type.split(';')[0].strip()
            ext = ([None] + mimetypes.guess_all_extensions(mime_type))[-1]
        return response.read(), content_type, ext

def find(date):
    """Return the image URL of the strip published on *date*.

    Raises CalvinError when the page contains no recognizable date or image,
    and NotAvailable when the page shows a strip for a different date.
    """
    page_url = CALVIN['url'] + date.strftime('%Y/%m/%d')
    page = read(page_url)[0].decode()

    found_date = CALVIN['date'].search(page)
    if not found_date:
        raise CalvinError('No date')
    shown = datetime.datetime.strptime(found_date.group(1), '%B %d, %Y').date()
    if shown != date:
        raise NotAvailable

    found_image = CALVIN['image'].search(page)
    if found_image:
        return found_image.group(1)
    raise CalvinError('No image')

def save(image_url, date):
    """Download *image_url* into a file named after *date*; return its extension.

    The file name is the date as YYYY-MM-DD plus the guessed extension (no
    extension when none was guessed). Raises SaveError when that file
    already exists.
    """
    content, _content_type, ext = read(image_url)
    suffix = ext or ''
    path = date.strftime('%Y-%m-%d') + suffix
    if not os.path.exists(path):
        with open(path, 'wb') as file:
            file.write(content)
        return ext
    raise SaveError('File exists')


# Replay the log: rebuild the per-date state recorded by earlier runs.
# Each line is expected to hold three whitespace-separated fields:
# date, image URL and file extension.
with open('log', 'r') as LOG:
    LOG_STATE = {}
    for line in LOG:
        text_date, url, ext = line.split()
        # Map the date to the saved file name, or to None when the logged
        # extension is the literal text 'None'.
        LOG_STATE[parse_date(text_date)] = text_date + ext if ext != 'None' else None

# Reopen the log in append mode so new entries can be added to it.
LOG = open('log', 'a')


def got_date(date):
    """Print *date* followed by a space (no newline) and flush stdout."""
    sys.stdout.write('{} '.format(date))
    sys.stdout.flush()

def got_already():
    """Report that the date was skipped."""
    sys.stdout.write('skip\n')

def got_no_image(date):
    """Report that no image was found for *date*.

    When the replayed log currently maps *date* to a file, a record with
    ``None`` for both URL and extension is appended to the log first.
    """
    previously_saved = LOG_STATE.get(date) is not None
    if previously_saved:
        print(date, None, None, file=LOG)
    print('None')

def got_url():
    """Print a progress dot (no newline) and flush stdout."""
    sys.stdout.write('.')
    sys.stdout.flush()

def got_image(date, url, ext):
    """Report the outcome of saving an image, logging it on success.

    A falsy *ext* is reported as ' file exists' and nothing is logged;
    otherwise the (date, url, ext) record is appended to LOG.
    """
    if not ext:
        print(' file exists')
        return
    print(date, url, ext, file=LOG)
    print(' ok')


def proceed(date, log_state):
    """Check the replayed log to decide whether *date* still needs downloading.

    Returns None when the download should go ahead: either the date has no
    log entry, or its recorded file is missing from disk.

    Raises NotAvailable when the log recorded no image for the date, and
    Skip when the recorded file already exists.
    """
    if date not in log_state:
        return
    recorded = log_state[date]
    if not recorded:
        raise NotAvailable
    # `recorded` is known to be truthy here, so only the disk check remains.
    if os.path.exists(recorded):
        raise Skip

def download(last, end, log):
    """Download the strips for every day after *last* up to and including *end*.

    *log* is the ``(log_file, log_state)`` pair produced by replay(). Progress
    is printed per day; successful downloads and unavailable days are
    appended to the log file.
    """
    log_file, log_state = log
    first = last.toordinal() + 1
    for ordinal in range(first, end.toordinal() + 1):
        day = datetime.date.fromordinal(ordinal)
        stamp = day.strftime('%Y-%m-%d')
        # Show the date right away; the outcome is printed on the same line.
        # (A context-manager based reporter would be a nicer fit here.)
        print(stamp, end=' ')
        sys.stdout.flush()
        try:
            proceed(day, log_state)
            image_url = find(day)
            ext = save(image_url, day)
            print(stamp, image_url, ext, file=log_file)
            print('ok')
        except (CalvinError, SaveError) as error:
            print(error)
        except NotAvailable:
            print(stamp, None, None, file=log_file)
            print('na')
        except Skip:
            print('--')


def main():
    """Replay the log, then download every strip after the last logged date up to today."""
    log = replay()
    today = datetime.date.today()
    if log[1]:
        last_entry = max(log[1])
    else:
        # download() starts on the day *after* `last`, so with an empty log
        # start from yesterday; otherwise the range is empty and nothing,
        # not even today's strip, would ever be fetched.
        last_entry = today - datetime.timedelta(days=1)
    download(last_entry, today, log)


if __name__ == '__main__':
    # NOTE(review): this span comes from a rendered diff, so the date loop
    # below (pre-refactor driver) and the trailing main() call (post-refactor
    # entry point) appear together — confirm which one the file should keep.
    start, end = input_interval()
    for date in date_range(start, end):
        got_date(date)
        # Already logged and present on disk: nothing to download.
        if date in LOG_STATE and LOG_STATE[date] and os.path.exists(LOG_STATE[date]):
            got_already()
            continue
        page = read_page(date)
        url = image_url(page)
        # No image on the page, or the page shows a different date's strip.
        if url is None or comic_date(page) != date:
            got_no_image(date)
            continue
        got_url()
        ext = save_image(url, date)
        got_image(date, url, ext)

    main()

# vim: et sw=4 sts=4

0 comments on commit cb32ba2

Please sign in to comment.