From 3238ac35a1ccbd09dd6bcb13ceed798047349fcf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Freitag?=
Date: Sun, 4 Oct 2020 21:40:14 +0200
Subject: [PATCH] Fix #6629: linkcheck: Handle rate-limiting

Follow the Retry-After header if present, otherwise use an exponential
back-off.

Close #7388
---
 CHANGES                       |   2 +
 doc/conf.py                   |   5 +-
 doc/usage/builders/index.rst  |   4 +
 doc/usage/configuration.rst   |  17 ++++
 sphinx/builders/linkcheck.py  | 105 +++++++++++++++++++++--
 tests/test_build_linkcheck.py | 154 ++++++++++++++++++++++++++++++++++
 6 files changed, 277 insertions(+), 10 deletions(-)

diff --git a/CHANGES b/CHANGES
index 5ad0d36a57a..f877efc1b81 100644
--- a/CHANGES
+++ b/CHANGES
@@ -33,6 +33,8 @@ Features added
 * #6914: Add a new event :event:`warn-missing-reference` to custom warning
   messages when failed to resolve a cross-reference
 * #6914: Emit a detailed warning when failed to resolve a ``:ref:`` reference
+* #6629: The linkcheck builder now handles rate limits. See
+  :confval:`linkcheck_rate_limit_timeout` for details.
 
 Bugs fixed
 ----------
diff --git a/doc/conf.py b/doc/conf.py
index 80ee2b01ccd..b995cca0832 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -110,7 +110,10 @@
      1),
 ]
 
-intersphinx_mapping = {'python': ('https://docs.python.org/3/', None)}
+intersphinx_mapping = {
+    'python': ('https://docs.python.org/3/', None),
+    'requests': ('https://requests.readthedocs.io/en/master', None),
+}
 
 # Sphinx document translation with sphinx gettext feature uses these settings:
 locale_dirs = ['locale/']
diff --git a/doc/usage/builders/index.rst b/doc/usage/builders/index.rst
index db6706944fd..c45a8062fbb 100644
--- a/doc/usage/builders/index.rst
+++ b/doc/usage/builders/index.rst
@@ -442,6 +442,10 @@ name is ``rinoh``. Refer to the `rinohtype manual`_ for details.
 
       Since Sphinx-1.5, the linkcheck builder comes to use requests module.
 
+   .. versionchanged:: 3.4
+
+      The linkcheck builder retries links when servers apply rate limits.
+
 .. module:: sphinx.builders.xml
 .. class:: XMLBuilder
 
diff --git a/doc/usage/configuration.rst b/doc/usage/configuration.rst
index 4881b86295c..ff651f75a6c 100644
--- a/doc/usage/configuration.rst
+++ b/doc/usage/configuration.rst
@@ -2525,6 +2525,23 @@ Options for the linkcheck builder
 
    .. versionadded:: 2.3
 
+.. confval:: linkcheck_rate_limit_timeout
+
+   The ``linkcheck`` builder may issue a large number of requests to the same
+   site over a short period of time. This setting controls the builder's
+   behavior when servers indicate that requests are rate-limited.
+
+   If a server indicates when to retry (using the `Retry-After`_ header),
+   ``linkcheck`` always follows that indication.
+
+   Otherwise, ``linkcheck`` waits for a minute before retrying and keeps
+   doubling the wait time between attempts until it succeeds or exceeds the
+   ``linkcheck_rate_limit_timeout``. By default, the timeout is 5 minutes.
+
+   .. _Retry-After: https://tools.ietf.org/html/rfc2616#section-14.37
+
+   .. versionadded:: 3.4
+
 Options for the XML builder
 ---------------------------
 
diff --git a/sphinx/builders/linkcheck.py b/sphinx/builders/linkcheck.py
index 1dc0337c3b1..60b06c591cf 100644
--- a/sphinx/builders/linkcheck.py
+++ b/sphinx/builders/linkcheck.py
@@ -13,13 +13,17 @@
 import re
 import socket
 import threading
+import time
+from datetime import datetime, timezone
+from email.utils import parsedate_to_datetime
 from html.parser import HTMLParser
 from os import path
-from typing import Any, Dict, List, Set, Tuple
+from typing import Any, Dict, List, NamedTuple, Optional, Set, Tuple
 from urllib.parse import unquote, urlparse
 
 from docutils import nodes
 from docutils.nodes import Node
+from requests import Response
 from requests.exceptions import HTTPError
 
 from sphinx.application import Sphinx
@@ -33,10 +37,14 @@
 
 uri_re = re.compile('([a-z]+:)?//')  # matches to foo:// and // (a protocol relative URL)
 
+RateLimit = NamedTuple('RateLimit', (('delay', float), ('next_check', float)))
 
 DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml;q=0.9,*/*;q=0.8',
 }
+CHECK_IMMEDIATELY = 0
+QUEUE_POLL_SECS = 1
+DEFAULT_DELAY = 60.0
 
 
 class AnchorCheckParser(HTMLParser):
@@ -98,7 +106,8 @@ def init(self) -> None:
         open(path.join(self.outdir, 'output.json'), 'w').close()
 
         # create queues and worker threads
-        self.wqueue = queue.Queue()  # type: queue.Queue
+        self.rate_limits = {}  # type: Dict[str, RateLimit]
+        self.wqueue = queue.PriorityQueue()  # type: queue.PriorityQueue
         self.rqueue = queue.Queue()  # type: queue.Queue
         self.workers = []  # type: List[threading.Thread]
         for i in range(self.app.config.linkcheck_workers):
@@ -172,16 +181,25 @@ def check_uri() -> Tuple[str, str, int]:
                                              config=self.app.config, auth=auth_info,
                                              **kwargs)
                     response.raise_for_status()
-                except HTTPError:
+                except HTTPError as err:
+                    if err.response.status_code == 429:
+                        raise
                     # retry with GET request if that fails, some servers
                     # don't like HEAD requests.
-                    response = requests.get(req_url, stream=True, config=self.app.config,
+                    response = requests.get(req_url, stream=True,
+                                            config=self.app.config,
                                             auth=auth_info, **kwargs)
                     response.raise_for_status()
             except HTTPError as err:
                 if err.response.status_code == 401:
                     # We'll take "Unauthorized" as working.
                     return 'working', ' - unauthorized', 0
+                elif err.response.status_code == 429:
+                    next_check = self.limit_rate(err.response)
+                    if next_check is not None:
+                        self.wqueue.put((next_check, uri, docname, lineno), False)
+                        return 'rate-limited', '', 0
+                    return 'broken', str(err), 0
                 elif err.response.status_code == 503:
                     # We'll take "Service Unavailable" as ignored.
                     return 'ignored', str(err), 0
@@ -189,6 +207,12 @@ def check_uri() -> Tuple[str, str, int]:
                     return 'broken', str(err), 0
             except Exception as err:
                 return 'broken', str(err), 0
+            else:
+                netloc = urlparse(req_url).netloc
+                try:
+                    del self.rate_limits[netloc]
+                except KeyError:
+                    pass
             if response.url.rstrip('/') == req_url.rstrip('/'):
                 return 'working', '', 0
             else:
@@ -247,11 +271,69 @@ def check(docname: str) -> Tuple[str, str, int]:
             return (status, info, code)
 
         while True:
-            uri, docname, lineno = self.wqueue.get()
+            next_check, uri, docname, lineno = self.wqueue.get()
             if uri is None:
                 break
+            netloc = urlparse(uri).netloc
+            try:
+                # Refresh rate limit.
+                # When there are many links in the queue, workers are all stuck waiting
+                # for responses, but the builder keeps queuing. Links in the queue may
+                # have been queued before rate limits were discovered.
+                next_check = self.rate_limits[netloc].next_check
+            except KeyError:
+                pass
+            if next_check > time.time():
+                # Sleep before putting message back in the queue to avoid
+                # waking up other threads.
+                time.sleep(QUEUE_POLL_SECS)
+                self.wqueue.put((next_check, uri, docname, lineno), False)
+                self.wqueue.task_done()
+                continue
             status, info, code = check(docname)
-            self.rqueue.put((uri, docname, lineno, status, info, code))
+            if status == 'rate-limited':
+                logger.info(darkgray('-rate limited- ') + uri + darkgray(' | sleeping...'))
+            else:
+                self.rqueue.put((uri, docname, lineno, status, info, code))
+            self.wqueue.task_done()
+
+    def limit_rate(self, response: Response) -> Optional[float]:
+        next_check = None
+        retry_after = response.headers.get("Retry-After")
+        if retry_after:
+            try:
+                # Integer: time to wait before next attempt.
+                delay = float(retry_after)
+            except ValueError:
+                try:
+                    # An HTTP-date: time of next attempt.
+                    until = parsedate_to_datetime(retry_after)
+                except (TypeError, ValueError):
+                    # TypeError: Invalid date format.
+                    # ValueError: Invalid date, e.g. Oct 52th.
+                    pass
+                else:
+                    next_check = datetime.timestamp(until)
+                    delay = (until - datetime.now(timezone.utc)).total_seconds()
+            else:
+                next_check = time.time() + delay
+        netloc = urlparse(response.url).netloc
+        if next_check is None:
+            max_delay = self.app.config.linkcheck_rate_limit_timeout
+            try:
+                rate_limit = self.rate_limits[netloc]
+            except KeyError:
+                delay = DEFAULT_DELAY
+            else:
+                last_wait_time = rate_limit.delay
+                delay = 2.0 * last_wait_time
+                if delay > max_delay and last_wait_time < max_delay:
+                    delay = max_delay
+            if delay > max_delay:
+                return None
+            next_check = time.time() + delay
+        self.rate_limits[netloc] = RateLimit(delay, next_check)
+        return next_check
 
     def process_result(self, result: Tuple[str, str, int, str, str, int]) -> None:
         uri, docname, lineno, status, info, code = result
@@ -325,7 +407,8 @@ def write_doc(self, docname: str, doctree: Node) -> None:
                 continue
             uri = refnode['refuri']
             lineno = get_node_line(refnode)
-            self.wqueue.put((uri, docname, lineno), False)
+            uri_info = (CHECK_IMMEDIATELY, uri, docname, lineno)
+            self.wqueue.put(uri_info, False)
             n += 1
 
         # image nodes
@@ -333,7 +416,8 @@ def write_doc(self, docname: str, doctree: Node) -> None:
             uri = imgnode['candidates'].get('?')
             if uri and '://' in uri:
                 lineno = get_node_line(imgnode)
-                self.wqueue.put((uri, docname, lineno), False)
+                uri_info = (CHECK_IMMEDIATELY, uri, docname, lineno)
+                self.wqueue.put(uri_info, False)
                 n += 1
 
         done = 0
@@ -355,8 +439,10 @@ def write_linkstat(self, data: dict) -> None:
             output.write('\n')
 
     def finish(self) -> None:
+        self.wqueue.join()
+        # Shutdown threads.
         for worker in self.workers:
-            self.wqueue.put((None, None, None), False)
+            self.wqueue.put((CHECK_IMMEDIATELY, None, None, None), False)
 
 
 def setup(app: Sphinx) -> Dict[str, Any]:
@@ -372,6 +458,7 @@ def setup(app: Sphinx) -> Dict[str, Any]:
     # Anchors starting with ! are ignored since they are
     # commonly used for dynamic pages
     app.add_config_value('linkcheck_anchors_ignore', ["^!"], None)
+    app.add_config_value('linkcheck_rate_limit_timeout', 300.0, None)
 
     return {
         'version': 'builtin',
diff --git a/tests/test_build_linkcheck.py b/tests/test_build_linkcheck.py
index c09c81fe08c..45497c7854f 100644
--- a/tests/test_build_linkcheck.py
+++ b/tests/test_build_linkcheck.py
@@ -10,13 +10,23 @@
 
 import http.server
 import json
+import re
+import sys
 import textwrap
+import time
+import wsgiref.handlers
+from datetime import datetime
+from unittest import mock
 
 import pytest
 import requests
 
+from sphinx.util.console import strip_colors
+
 from .utils import CERT_FILE, http_server, https_server, modify_env
 
+ts_re = re.compile(r".*\[(?P<ts>.*)\].*")
+
 
 @pytest.mark.sphinx('linkcheck', testroot='linkcheck', freshenv=True)
 def test_defaults(app):
@@ -382,3 +392,147 @@ def test_connect_to_selfsigned_nonexistent_cert_file(app):
         "uri": "https://localhost:7777/",
         "info": "Could not find a suitable TLS CA certificate bundle, invalid path: does/not/exist",
     }
+
+
+def make_retry_after_handler(responses, log_time=False):
+    class RetryAfterHandler(http.server.BaseHTTPRequestHandler):
+        def do_HEAD(self):
+            status, retry_after = responses.pop(0)
+            self.send_response(status)
+            if retry_after:
+                self.send_header('Retry-After', retry_after)
+            self.end_headers()
+
+        def log_date_time_string(self):
+            """Strip date and time from logged messages for assertions."""
+            if log_time:
+                return datetime.now().isoformat()
+            return ""
+
+    return RetryAfterHandler
+
+
+@pytest.mark.sphinx('linkcheck', testroot='linkcheck-localserver', freshenv=True)
+def test_too_many_requests_retry_after_int_delay(app, capsys, status):
+    with http_server(make_retry_after_handler([(429, "0"), (200, None)])), \
+            mock.patch("sphinx.builders.linkcheck.DEFAULT_DELAY", 0), \
+            mock.patch("sphinx.builders.linkcheck.QUEUE_POLL_SECS", 0.01):
+        app.builder.build_all()
+    content = (app.outdir / 'output.json').read_text()
+    assert json.loads(content) == {
+        "filename": "index.rst",
+        "lineno": 1,
+        "status": "working",
+        "code": 0,
+        "uri": "http://localhost:7777/",
+        "info": "",
+    }
+    rate_limit_log = "-rate limited- http://localhost:7777/ | sleeping...\n"
+    assert rate_limit_log in strip_colors(status.getvalue())
+    _stdout, stderr = capsys.readouterr()
+    assert stderr == textwrap.dedent(
+        """\
+        127.0.0.1 - - [] "HEAD / HTTP/1.1" 429 -
+        127.0.0.1 - - [] "HEAD / HTTP/1.1" 200 -
+        """
+    )
+
+
+@pytest.mark.sphinx('linkcheck', testroot='linkcheck-localserver', freshenv=True)
+def test_too_many_requests_retry_after_HTTP_date(app, capsys):
+    now = datetime.now().timetuple()
+    retry_after = wsgiref.handlers.format_date_time(time.mktime(now))
+    with http_server(make_retry_after_handler([(429, retry_after), (200, None)])):
+        app.builder.build_all()
+    content = (app.outdir / 'output.json').read_text()
+    assert json.loads(content) == {
+        "filename": "index.rst",
+        "lineno": 1,
+        "status": "working",
+        "code": 0,
+        "uri": "http://localhost:7777/",
+        "info": "",
+    }
+    _stdout, stderr = capsys.readouterr()
+    assert stderr == textwrap.dedent(
+        """\
+        127.0.0.1 - - [] "HEAD / HTTP/1.1" 429 -
+        127.0.0.1 - - [] "HEAD / HTTP/1.1" 200 -
+        """
+    )
+
+
+@pytest.mark.sphinx('linkcheck', testroot='linkcheck-localserver', freshenv=True)
+def test_too_many_requests_retry_after_without_header(app, capsys):
+    with http_server(make_retry_after_handler([(429, None), (200, None)])),\
+            mock.patch("sphinx.builders.linkcheck.DEFAULT_DELAY", 0):
+        app.builder.build_all()
+    content = (app.outdir / 'output.json').read_text()
+    assert json.loads(content) == {
+        "filename": "index.rst",
+        "lineno": 1,
+        "status": "working",
+        "code": 0,
+        "uri": "http://localhost:7777/",
+        "info": "",
+    }
+    _stdout, stderr = capsys.readouterr()
+    assert stderr == textwrap.dedent(
+        """\
+        127.0.0.1 - - [] "HEAD / HTTP/1.1" 429 -
+        127.0.0.1 - - [] "HEAD / HTTP/1.1" 200 -
+        """
+    )
+
+
+@pytest.mark.sphinx('linkcheck', testroot='linkcheck-localserver', freshenv=True)
+def test_too_many_requests_user_timeout(app, capsys):
+    app.config.linkcheck_rate_limit_timeout = 0.0
+    with http_server(make_retry_after_handler([(429, None)])):
+        app.builder.build_all()
+    content = (app.outdir / 'output.json').read_text()
+    assert json.loads(content) == {
+        "filename": "index.rst",
+        "lineno": 1,
+        "status": "broken",
+        "code": 0,
+        "uri": "http://localhost:7777/",
+        "info": "429 Client Error: Too Many Requests for url: http://localhost:7777/",
+    }
+
+
+@pytest.mark.sphinx('linkcheck', testroot='linkcheck-localserver', freshenv=True)
+def test_too_many_requests_retry_exponential_backoff(app, capsys):
+    retry_handler = make_retry_after_handler(
+        [(429, None), (429, None), (200, None)],
+        log_time=True,
+    )
+    with http_server(retry_handler),\
+            mock.patch("sphinx.builders.linkcheck.DEFAULT_DELAY", 1), \
+            mock.patch("sphinx.builders.linkcheck.QUEUE_POLL_SECS", 0.01):
+        start = datetime.now()
+        app.builder.build_all()
+    content = (app.outdir / 'output.json').read_text()
+    assert json.loads(content) == {
+        "filename": "index.rst",
+        "lineno": 1,
+        "status": "working",
+        "code": 0,
+        "uri": "http://localhost:7777/",
+        "info": "",
+    }
+    _stdout, stderr = capsys.readouterr()
+    req_logs = stderr.splitlines()
+    timestamps = []
+    assert len(req_logs) == 3
+    for line in req_logs:
+        timestamp_str = ts_re.match(line).group("ts")
+        if sys.version_info >= (3, 7):
+            timestamp = datetime.fromisoformat(timestamp_str)
+        else:
+            timestamp = datetime.strptime(timestamp_str, "%Y-%m-%dT%H:%M:%S.%f")
+        timestamps.append(timestamp)
+    # Duration allocated to the thread to do its work.
+    margin = 1.0
+    assert 1.0 <= (timestamps[1] - start).total_seconds() < 1.0 + margin
+    assert 2.0 <= (timestamps[2] - timestamps[1]).total_seconds() < 2.0 + margin
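
Note: for readers who want the scheduling rule at a glance, the decision procedure that the
patch implements in limit_rate can be condensed into a small standalone sketch. This is
illustrative only, not code from the patch: the next_retry helper name is hypothetical, and
MAX_DELAY stands in for the linkcheck_rate_limit_timeout setting. Only stdlib calls are used.

    import time
    from datetime import datetime, timezone
    from email.utils import parsedate_to_datetime

    DEFAULT_DELAY = 60.0   # first wait when the server sends no Retry-After
    MAX_DELAY = 300.0      # stand-in for linkcheck_rate_limit_timeout

    def next_retry(retry_after, last_delay=None):
        """Return (delay, next_check) or None once the back-off budget is spent.

        retry_after is the raw Retry-After header value (or None); last_delay
        is the delay used for the previous attempt against the same host.
        """
        if retry_after:
            try:
                # Integer form: number of seconds to wait.
                delay = float(retry_after)
            except ValueError:
                try:
                    # HTTP-date form: absolute time of the next attempt.
                    until = parsedate_to_datetime(retry_after)
                except (TypeError, ValueError):
                    pass  # malformed header; fall back to exponential back-off
                else:
                    delay = (until - datetime.now(timezone.utc)).total_seconds()
                    return delay, until.timestamp()
            else:
                return delay, time.time() + delay
        # No usable Retry-After: start at DEFAULT_DELAY, double on each retry,
        # cap once at MAX_DELAY, then give up (the link is reported broken).
        delay = DEFAULT_DELAY if last_delay is None else 2.0 * last_delay
        if delay > MAX_DELAY and (last_delay or 0.0) < MAX_DELAY:
            delay = MAX_DELAY
        if delay > MAX_DELAY:
            return None
        return delay, time.time() + delay

For example, next_retry("120") waits two minutes as instructed by the server, while repeated
calls without a header (threading each returned delay back in as last_delay) wait 60, 120,
and 240 seconds, then a final capped 300 seconds, and finally return None, which corresponds
to the patch reporting the link as broken once the timeout is exceeded.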