
Exception handling for IA #42

Closed
90 changes: 53 additions & 37 deletions web_monitoring/internetarchive.py
@@ -34,6 +34,24 @@ class UnexpectedResponseFormat(WebMonitoringException):
URL_CHUNK_PATTERN = re.compile('\<(.*)\>')
DATETIME_CHUNK_PATTERN = re.compile(' datetime="(.*)",')

def check_exists(lines):
    """
    Check if Internet Archive has archived versions of a url.
    """
    try:
        # The first three lines contain no information we need.
        for _ in range(3):
            next(lines)
    except StopIteration:
        print("Internet archive does not have archived versions of this url.")
Contributor:

I appreciate the clarification here of what is happening with StopIteration. To refine it a bit, I would make this sentence a code comment instead of a call to print. Library code should almost never print, because it might "spam" the screen in a way that the calling application does not want, and the application has no easy way of silencing the printing.

Contributor Author:

Alright, I'll change it to a comment. I guess I'll learn more about good practices for library code with experience.

        return False

    return True
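
A minimal sketch of the change suggested in the review thread above: the print call becomes a code comment, so the library stays silent and the boolean return value carries the information. This is illustrative only, not the code that was merged.

def check_exists(lines):
    """
    Check if Internet Archive has archived versions of a url.
    """
    try:
        # The first three lines contain no information we need.
        for _ in range(3):
            next(lines)
    except StopIteration:
        # Internet Archive does not have archived versions of this url.
        return False
    return True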


def list_versions(url):
"""
Expand Down Expand Up @@ -68,40 +86,38 @@ def list_versions(url):
    res = requests.get(first_page_url)
    lines = res.iter_lines()

    while True:
        # Continue requesting pages of responses until the last page.
        try:
            # The first three lines contain no information we need.
            for _ in range(3):
                next(lines)
        except StopIteration:
            # There are no more pages left to parse.
            break
        for line in lines:
            # Lines are made up of semicolon-separated chunks:
            # b'<http://web.archive.org/web/19961231235847/http://www.nasa.gov:80/>; rel="memento"; datetime="Tue, 31 Dec 1996 23:58:47 GMT",'

            # Split by semicolon. Fail with an informative error if there are
            # not exactly three chunks.
            try:
                url_chunk, rel_chunk, dt_chunk = line.decode().split(';')
            except ValueError:
                raise UnexpectedResponseFormat(line.decode())

            if 'timemap' in rel_chunk:
                # This line is a link to the next page of mementos.
                next_page_url, = URL_CHUNK_PATTERN.match(url_chunk).groups()
                res = requests.get(next_page_url)
                lines = res.iter_lines()
                break

            # Extract the URL and the datetime from the surrounding characters.
            # Again, fail with an informative error.
            try:
                uri, = URL_CHUNK_PATTERN.match(url_chunk).groups()
                dt_str, = DATETIME_CHUNK_PATTERN.match(dt_chunk).groups()
            except AttributeError:
                raise UnexpectedResponseFormat(line.decode())

            dt = datetime.strptime(dt_str, DATE_FMT)
            yield dt, uri
    exists = check_exists(lines)
    if exists:
        while True:
            for line in lines:
                # Lines are made up of semicolon-separated chunks:
                # b'<http://web.archive.org/web/19961231235847/http://www.nasa.gov:80/>; rel="memento"; datetime="Tue, 31 Dec 1996 23:58:47 GMT",'

                # Split by semicolon. Fail with an informative error if there are
                # not exactly three chunks.
                try:
                    url_chunk, rel_chunk, dt_chunk = line.decode().split(';')
                except ValueError:
                    raise UnexpectedResponseFormat(line.decode())

                if 'timemap' in rel_chunk:
                    # This line is a link to the next page of mementos.
                    next_page_url, = URL_CHUNK_PATTERN.match(url_chunk).groups()
                    res = requests.get(next_page_url)
                    lines = res.iter_lines()
                    break

                # Extract the URL and the datetime from the surrounding characters.
                # Again, fail with an informative error.
                try:
                    uri, = URL_CHUNK_PATTERN.match(url_chunk).groups()
                    dt_str, = DATETIME_CHUNK_PATTERN.match(dt_chunk).groups()
                except AttributeError:
                    raise UnexpectedResponseFormat(line.decode())

                dt = datetime.strptime(dt_str, DATE_FMT)
                yield dt, uri
    else:
        yield None, None
Contributor:

I would prefer this to fail more loudly than yielding (None, None). A downstream caller of this is generally expecting to get back a timestamp and a URI, and it could error out in a confusing way. Better to fail sooner if someone tries to crawl a nonexistent URL: I think if check_exists fails, we can raise ValueError or a subclass thereof.

Contributor Author:

I agree with you. As I mentioned earlier, this is just a simple solution I tried to get it running. I'm working on moving the url checking up a layer entirely and will push that code soon.

Contributor:

Sounds good.
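
A hypothetical sketch of the reviewer's suggestion: raise a ValueError subclass when check_exists fails, rather than yielding (None, None). The exception name MementoNotFound and the timemap URL construction are assumptions for illustration, not code from this PR.

import requests


class MementoNotFound(ValueError):
    """Raised when the Internet Archive has no archived versions of a url."""


def list_versions(url):
    # Assumed timemap endpoint; the real first_page_url construction is
    # elided from this diff.
    first_page_url = 'http://web.archive.org/web/timemap/link/{}'.format(url)
    res = requests.get(first_page_url)
    lines = res.iter_lines()
    if not check_exists(lines):
        # Fail loudly so downstream callers never see (None, None).
        raise MementoNotFound(
            'Internet Archive has no archived versions of {}'.format(url))
    # ... continue parsing mementos as in the diff above ...

A caller that wants the old soft behavior could catch MementoNotFound explicitly, which keeps the failure visible instead of letting (None, None) propagate into downstream parsing.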