#!/usr/bin/env python3
"""Download Calvin and Hobbes strips from gocomics.com, one image per day.

Progress is kept in a plain-text log file (``log`` in the working directory
by default).  Every processed day appends one line of three whitespace
separated fields: ``<YYYY-MM-DD> <image url> <extension>``.  A day for which
no strip was available is recorded as ``<YYYY-MM-DD> None None``.
"""

import datetime
import mimetypes
import os
import re
import sys
import urllib.request


# Site description: base URL plus the patterns used to scrape a strip page.
CALVIN = {
    'url': r'http://www.gocomics.com/calvinandhobbes/',
    'image': re.compile(r'(http://cdn.svcs.c2.uclick.com/c2/\w{32})\?width'),
    'date': re.compile(r'Calvin and Hobbes Comic Strip, (\w+ \d{1,2}, \d{4})'),
}

# Headers sent with every HTTP request.
REQUEST_HEADERS = {
    'User-Agent': r'Mozilla/5.0 (X11; Linux i686; rv:9.0.1) Gecko/20100101 Firefox/9.0.1',
}

# Backward-compatible alias for the original (misspelled) constant name.
REQUEST_HEADRES = REQUEST_HEADERS


class CalvinError(Exception):
    """The strip page did not contain the expected date or image markup."""


class SaveError(Exception):
    """The downloaded image could not be written to disk."""


class NotAvailable(Exception):
    """No strip exists for the requested day."""


class Skip(Exception):
    """The strip for the requested day has already been downloaded."""


def date(text):
    """Parse a ``YYYY-MM-DD`` string into a ``datetime.date``.

    A plain ``date`` (not a ``datetime``) is returned on purpose: the values
    are used as dictionary keys that are later looked up with
    ``datetime.date`` objects, and a ``datetime`` never compares equal to a
    ``date``.

    Raises:
        ValueError: if *text* does not match the ``%Y-%m-%d`` format.
    """
    return datetime.datetime.strptime(text, '%Y-%m-%d').date()


def replay(path='log'):
    """Rebuild the download state from the log and reopen it for appending.

    Args:
        path: location of the log file; created if it does not exist yet.

    Returns:
        A ``(log_file, log_state)`` pair.  ``log_file`` is the log opened in
        append mode (the caller is responsible for closing it).
        ``log_state`` maps each logged ``datetime.date`` to the file name the
        image was saved under (``<date><ext>``), or to ``None`` when the
        logged extension is the literal text ``None``.  Later lines override
        earlier ones for the same day.

    Raises:
        ValueError: if a non-blank line does not consist of exactly three
            fields or its date field is malformed.
    """
    log_state = {}
    try:
        with open(path, 'r') as log:
            for line in log:
                if not line.strip():
                    continue  # tolerate blank lines
                date_text, _url, ext = line.split()
                log_state[date(date_text)] = (
                    date_text + ext if ext != 'None' else None
                )
    except FileNotFoundError:
        # First run: there is no history to replay yet.
        pass
    return open(path, 'a'), log_state


def _guess_extension(content_type):
    """Return a file extension for a ``Content-Type`` header value, or None.

    Header parameters (``; charset=...``) are stripped before the lookup and
    a missing header is treated as an unknown type.  When several extensions
    are known for the type, the last one reported by ``mimetypes`` is used.
    """
    media_type = (content_type or '').split(';', 1)[0].strip()
    return ([None] + mimetypes.guess_all_extensions(media_type))[-1]


def read(url):
    """Fetch *url* and return ``(content, content_type, ext)``.

    ``content`` is the raw response body (bytes), ``content_type`` the
    unmodified ``Content-Type`` header (``None`` if absent) and ``ext`` the
    file extension guessed from it (``None`` if unknown).

    Errors raised by ``urllib.request.urlopen`` propagate to the caller.
    """
    request = urllib.request.Request(url=url, headers=REQUEST_HEADERS)
    with urllib.request.urlopen(request) as response:
        content_type = response.getheader('Content-Type')
        return response.read(), content_type, _guess_extension(content_type)


def _image_url_from_page(page, date):
    """Extract the strip image URL from the HTML text of the page for *date*.

    Raises:
        CalvinError: the page lacks the strip date or the image URL.
        NotAvailable: the page shows a strip for a different day.
    """
    date_match = CALVIN['date'].search(page)
    if not date_match:
        raise CalvinError('No date')
    page_date = datetime.datetime.strptime(date_match.group(1), '%B %d, %Y').date()
    if page_date != date:
        raise NotAvailable
    image_match = CALVIN['image'].search(page)
    if not image_match:
        raise CalvinError('No image')
    return image_match.group(1)


def find(date):
    """Return the image URL of the strip published on *date*.

    Args:
        date: the ``datetime.date`` of the wanted strip.

    Raises:
        CalvinError: the page lacks the strip date or the image URL.
        NotAvailable: the page shows a strip for a different day.
    """
    page = read(CALVIN['url'] + date.strftime('%Y/%m/%d'))[0].decode()
    return _image_url_from_page(page, date)


def save(image_url, date):
    """Download *image_url* into ``<YYYY-MM-DD><ext>`` in the working directory.

    Returns:
        The extension that was appended to the file name, or ``None`` when
        the content type was unknown and the file was saved without one.
        NOTE: a ``None`` extension ends up in the log as the text ``None``,
        which ``replay`` cannot tell apart from "no strip available".

    Raises:
        SaveError: a file with the target name already exists.
    """
    content, _content_type, ext = read(image_url)
    path = date.strftime('%Y-%m-%d') + (ext or '')
    try:
        # Exclusive creation: never overwrite, and no exists()/open() race.
        with open(path, 'xb') as file:
            file.write(content)
    except FileExistsError:
        raise SaveError('File exists') from None
    return ext


def proceed(date, log_state):
    """Decide from the replayed log whether *date* still has to be fetched.

    Returns normally when the day is not in the log, or when it is logged
    but the image file it names is missing from disk.

    Raises:
        NotAvailable: the log records that no strip exists for the day.
        Skip: the log names an image file and that file exists.
    """
    if date not in log_state:
        return
    saved_as = log_state[date]
    if not saved_as:
        raise NotAvailable
    if os.path.exists(saved_as):
        raise Skip


def download(last, end, log):
    """Fetch every strip after *last* up to and including *end*.

    Args:
        last: the last day already handled; fetching starts the day after.
        end: the final day to fetch (inclusive).
        log: the ``(log_file, log_state)`` pair returned by ``replay``.

    One status line per day is printed to stdout and every newly obtained
    result is appended to ``log_file``.  Network errors are not caught here.
    """
    log_file, log_state = log
    for ordinal in range(last.toordinal() + 1, end.toordinal() + 1):
        day = datetime.date.fromordinal(ordinal)
        date_text = day.strftime('%Y-%m-%d')
        print(date_text, end=' ')
        sys.stdout.flush()  # show the date while the download is in progress

        # First consult the log; a verdict found there is already recorded
        # and must not be appended a second time.
        try:
            proceed(day, log_state)
        except NotAvailable:
            print('na')
            continue
        except Skip:
            print('--')
            continue

        try:
            image_url = find(day)
            ext = save(image_url, day)
        except (CalvinError, SaveError) as e:
            print(e)
        except NotAvailable:
            print(date_text, None, None, file=log_file)
            print('na')
        else:
            print(date_text, image_url, ext, file=log_file)
            print('ok')


def main():
    """Download all strips from the day after the last log entry until today."""
    log_file, log_state = replay()
    today = datetime.date.today()
    if log_state:
        last_entry = max(log_state)
    else:
        # Empty log: start at yesterday so that at least today's strip is
        # fetched and the log gets its first entry.
        last_entry = today - datetime.timedelta(days=1)
    with log_file:  # make sure the log is flushed and closed on exit
        download(last_entry, today, (log_file, log_state))


if __name__ == '__main__':
    main()

# vim: et sw=4 sts=4