-
-
Notifications
You must be signed in to change notification settings - Fork 20
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Exception handling for IA #42
Changes from 1 commit
d133f66
bedfb90
50ed719
49296e4
8411db0
7c4c04b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -34,6 +34,24 @@ class UnexpectedResponseFormat(WebMonitoringException): | |
URL_CHUNK_PATTERN = re.compile('\<(.*)\>') | ||
DATETIME_CHUNK_PATTERN = re.compile(' datetime="(.*)",') | ||
|
||
def check_exists(lines):
    """
    Check whether the Internet Archive has archived versions of a url.

    Parameters
    ----------
    lines : iterator of bytes
        Iterator over the response lines of an archive.org timemap query.
        As a side effect, the first three (header) lines are consumed so
        that the caller can continue iterating from the first memento line.

    Returns
    -------
    bool
        True if the response has at least the three header lines
        (i.e. the archive responded with a memento list), False otherwise.
    """
    try:
        # The first three lines contain no information we need.
        for _ in range(3):
            next(lines)
    except StopIteration:
        # Fewer than three lines means the Internet Archive has no
        # archived versions of this url.  Library code must not print;
        # the False return value is the signal to the caller.
        return False

    return True
|
||
|
||
def list_versions(url): | ||
""" | ||
|
@@ -68,40 +86,38 @@ def list_versions(url): | |
res = requests.get(first_page_url) | ||
lines = res.iter_lines() | ||
|
||
while True: | ||
# Continue requesting pages of responses until the last page. | ||
try: | ||
# The first three lines contain no information we need. | ||
for _ in range(3): | ||
next(lines) | ||
except StopIteration: | ||
# There are no more pages left to parse. | ||
break | ||
for line in lines: | ||
# Lines are made up semicolon-separated chunks: | ||
# b'<http://web.archive.org/web/19961231235847/http://www.nasa.gov:80/>; rel="memento"; datetime="Tue, 31 Dec 1996 23:58:47 GMT",' | ||
|
||
# Split by semicolon. Fail with an informative error if there are | ||
# not exactly three chunks. | ||
try: | ||
url_chunk, rel_chunk, dt_chunk = line.decode().split(';') | ||
except ValueError: | ||
raise UnexpectedResponseFormat(line.decode()) | ||
|
||
if 'timemap' in rel_chunk: | ||
# This line is a link to the next page of mementos. | ||
next_page_url, = URL_CHUNK_PATTERN.match(url_chunk).groups() | ||
res = requests.get(next_page_url) | ||
lines = res.iter_lines() | ||
break | ||
|
||
# Extract the URL and the datetime from the surrounding characters. | ||
# Again, fail with an informative error. | ||
try: | ||
uri, = URL_CHUNK_PATTERN.match(url_chunk).groups() | ||
dt_str, = DATETIME_CHUNK_PATTERN.match(dt_chunk).groups() | ||
except AttributeError: | ||
raise UnexpectedResponseFormat(line.decode()) | ||
|
||
dt = datetime.strptime(dt_str, DATE_FMT) | ||
yield dt, uri | ||
exists = check_exists(lines) | ||
if exists: | ||
|
||
while True: | ||
|
||
for line in lines: | ||
# Lines are made up semicolon-separated chunks: | ||
# b'<http://web.archive.org/web/19961231235847/http://www.nasa.gov:80/>; rel="memento"; datetime="Tue, 31 Dec 1996 23:58:47 GMT",' | ||
|
||
# Split by semicolon. Fail with an informative error if there are | ||
# not exactly three chunks. | ||
try: | ||
url_chunk, rel_chunk, dt_chunk = line.decode().split(';') | ||
except ValueError: | ||
raise UnexpectedResponseFormat(line.decode()) | ||
|
||
if 'timemap' in rel_chunk: | ||
# This line is a link to the next page of mementos. | ||
next_page_url, = URL_CHUNK_PATTERN.match(url_chunk).groups() | ||
res = requests.get(next_page_url) | ||
lines = res.iter_lines() | ||
break | ||
|
||
# Extract the URL and the datetime from the surrounding characters. | ||
# Again, fail with an informative error. | ||
try: | ||
uri, = URL_CHUNK_PATTERN.match(url_chunk).groups() | ||
dt_str, = DATETIME_CHUNK_PATTERN.match(dt_chunk).groups() | ||
except AttributeError: | ||
raise UnexpectedResponseFormat(line.decode()) | ||
|
||
dt = datetime.strptime(dt_str, DATE_FMT) | ||
yield dt, uri | ||
else: | ||
yield None,None | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would prefer this to fail more loudly than yielding There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree with you. Like I had mentioned earlier, this is just a simple solution I've tried to get it running. I'm trying to completely move the url checking a layer up and will push that code soon. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sounds good. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I appreciate the clarification here of what is happening with
StopIteration
. To refine it a bit, I would make this sentence a code comment instead of a call to `print`
. Library code should almost never print because it might "spam" the screen in a way that the calling application does not want, and the application has no easy way of silencing the printing. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Alright, I'll change it to a comment. I guess I'll learn more about good practices for library code with experience.