Skip to content

Commit

Permalink
Merge pull request #3644 from stefanor/link-check-timeout
Browse files Browse the repository at this point in the history
Add a (configurable) timeout to the link checker
  • Loading branch information
Kwpolska committed Sep 3, 2022
2 parents 1f487e6 + 680bdb5 commit 8780db9
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 3 deletions.
1 change: 1 addition & 0 deletions AUTHORS.txt
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@
* `Sean Pue <https://github.com/seanpue>`_
* `Simon van der Veldt <https://github.com/simonvanderveldt>`_
* `Stefan Näwe <https://github.com/snaewe>`_
* `Stefano Rivera <https://github.com/stefanor>`_
* `Stephan Fitzpatrick <https://github.com/knowsuchagency>`_
* `Sukil Etxenike <https://github.com/sukiletxe>`_
* `Ted Timmons <https://github.com/tedder>`_
Expand Down
9 changes: 9 additions & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
New in master
=============

Features
--------

* Add a ``--timeout`` parameter to the ``check`` plugin, defaulting to
30s. (Issue #3643)

New in v8.2.3
=============

Expand Down
16 changes: 13 additions & 3 deletions nikola/plugins/command/check.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,14 @@ class CommandCheck(Command):
'default': False,
'help': 'Check that remote links work.',
},
{
'name': 'timeout',
'long': 'timeout',
'short': 't',
'type': int,
'default': 30,
'help': 'Timeout (in seconds) for HTTP requests in remote checks.',
},
]

def _execute(self, options, args):
Expand All @@ -160,6 +168,7 @@ def _execute(self, options, args):
else:
self.logger.level = logging.WARNING
failure = False
self.timeout = options['timeout']
if options['links']:
failure |= self.scan_links(options['find_sources'], options['remote'])
if options['files']:
Expand All @@ -171,6 +180,7 @@ def _execute(self, options, args):

existing_targets = set([])
checked_remote_targets = {}
timeout = None
cache = {}

def analyze(self, fname, find_sources=False, check_remote=False):
Expand Down Expand Up @@ -279,19 +289,19 @@ def analyze(self, fname, find_sources=False, check_remote=False):

# Check the remote link works
req_headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0 (Nikola)'} # I’m a real boy!
-resp = requests.head(target, headers=req_headers, allow_redirects=False)
+resp = requests.head(target, headers=req_headers, allow_redirects=False, timeout=self.timeout)

# Retry client errors (4xx) as GET requests because many servers are broken
if resp.status_code >= 400 and resp.status_code <= 499:
time.sleep(0.5)
-resp = requests.get(target, headers=req_headers, allow_redirects=False)
+resp = requests.get(target, headers=req_headers, allow_redirects=False, timeout=self.timeout)

# Follow redirects and see where they lead, redirects to errors will be reported twice
if resp.status_code in [301, 302, 307, 308]:
redir_status_code = resp.status_code
time.sleep(0.5)
# Known redirects are retested using GET because IIS servers otherwise get HEADaches
-resp = requests.get(target, headers=req_headers, allow_redirects=True)
+resp = requests.get(target, headers=req_headers, allow_redirects=True, timeout=self.timeout)
# Permanent redirects should be updated
if redir_status_code in [301, 308]:
self.logger.warning("Remote link moved PERMANENTLY to \"{0}\" and should be updated in {1}: {2} [HTTP: {3}]".format(resp.url, filename, target, redir_status_code))
Expand Down

0 comments on commit 8780db9

Please sign in to comment.