Skip to content

Commit

Permalink
always retry for hostnames we expect to exist (ia, cc)
Browse files Browse the repository at this point in the history
  • Loading branch information
wumpus committed Oct 8, 2020
1 parent 12357e2 commit dee4ada
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 2 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
- 0.9.28 (not yet tagged)
+ expose warc_version= keyword argument for warc writing (but it's untested and broken for --ia warc)
+ improve dns retry algorithm: always retry for hostnames we expect to exist (ia, cc)

- 0.9.27
+ packaging: fix for using markdown without explicit conversion in setup.py

Expand Down
24 changes: 22 additions & 2 deletions cdx_toolkit/myrequests.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,27 @@
import requests
import logging
import os
import time
from urllib.parse import urlparse

from . import __version__

LOGGER = logging.getLogger(__name__)


# Hostnames we expect to exist. A DNS failure for one of these is retried
# rather than treated as an invalid hostname. myrequests_get() also adds
# every hostname it has fetched from successfully.
# NOTE(review): the original literal listed 'web.archive.org' twice; a set
# keeps only one copy, so the duplicate is dropped here. If the second entry
# was meant to be a different host, add it -- confirm with the author.
previously_seen_hostnames = {
    'commoncrawl.s3.amazonaws.com',
    'web.archive.org',
}


def dns_fatal(url):
    '''We have a dns error, should we fail immediately or not?

    Returns True when the hostname of url is not in
    previously_seen_hostnames, meaning the caller should give up at once.
    Returns False when the hostname is known, so the caller can keep
    retrying instead.

    A url with no network location has a hostname of None, which is never
    in the set, so it is reported as fatal.
    '''
    hostname = urlparse(url).hostname
    # Always return an explicit bool instead of falling through to None.
    return hostname not in previously_seen_hostnames


def myrequests_get(url, params=None, headers=None, cdx=False, allow404=False):
if params:
if 'from_ts' in params:
Expand Down Expand Up @@ -60,7 +74,8 @@ def myrequests_get(url, params=None, headers=None, cdx=False, allow404=False):
string = '{} failures for url {} {!r}: {}'.format(connect_errors, url, params, str(e))

if 'Name or service not known' in string:
raise ValueError('invalid hostname in url '+url) from None
if dns_fatal(url):
raise ValueError('invalid hostname in url '+url) from None

if connect_errors > 100:
LOGGER.error(string)
Expand All @@ -72,4 +87,9 @@ def myrequests_get(url, params=None, headers=None, cdx=False, allow404=False):
except requests.exceptions.RequestException as e: # pragma: no cover
LOGGER.warning('something unexpected happened, giving up after %s', str(e))
raise

hostname = urlparse(url).hostname
if hostname not in previously_seen_hostnames:
previously_seen_hostnames.add(hostname)

return resp

0 comments on commit dee4ada

Please sign in to comment.