From 9cbd51121b16cfe2c77f141e13088fdf7e28bcf5 Mon Sep 17 00:00:00 2001 From: Gianni Tedesco Date: Wed, 26 Oct 2022 20:27:31 +0900 Subject: [PATCH] Replace quadratic algo in LineDecoder Leading to enormous speedups when doing things such as Response(...).iter_lines() as described on issue #2422 --- httpx/_decoders.py | 65 ++++++++++------------------------ tests/models/test_responses.py | 4 +-- tests/test_decoders.py | 28 +++++++-------- 3 files changed, 34 insertions(+), 63 deletions(-) diff --git a/httpx/_decoders.py b/httpx/_decoders.py index 69c03697db..a35c9471d3 100644 --- a/httpx/_decoders.py +++ b/httpx/_decoders.py @@ -266,57 +266,28 @@ def __init__(self) -> None: self.buffer = "" def decode(self, text: str) -> typing.List[str]: - lines = [] - - if text and self.buffer and self.buffer[-1] == "\r": - if text.startswith("\n"): - # Handle the case where we have an "\r\n" split across - # our previous input, and our new chunk. - lines.append(self.buffer[:-1] + "\n") - self.buffer = "" - text = text[1:] - else: - # Handle the case where we have "\r" at the end of our - # previous input. 
- lines.append(self.buffer[:-1] + "\n") - self.buffer = "" - - while text: - num_chars = len(text) - for idx in range(num_chars): - char = text[idx] - next_char = None if idx + 1 == num_chars else text[idx + 1] - if char == "\n": - lines.append(self.buffer + text[: idx + 1]) - self.buffer = "" - text = text[idx + 1 :] - break - elif char == "\r" and next_char == "\n": - lines.append(self.buffer + text[:idx] + "\n") - self.buffer = "" - text = text[idx + 2 :] - break - elif char == "\r" and next_char is not None: - lines.append(self.buffer + text[:idx] + "\n") - self.buffer = "" - text = text[idx + 1 :] - break - elif next_char is None: - self.buffer += text - text = "" - break + if self.buffer: + text = self.buffer + text + + if not text: + return [] + + lines = text.splitlines() + if text.endswith("\n"): + self.buffer = "" + else: + remainder = lines.pop() + # Handle the case where there is a CR, but we may need to wait for + # an LF in the next chunk to avoid emitting spurious lines. + if text.endswith("\r"): + remainder += "\r" + self.buffer = remainder return lines def flush(self) -> typing.List[str]: - if self.buffer.endswith("\r"): - # Handle the case where we had a trailing '\r', which could have - # been a '\r\n' pair. 
- lines = [self.buffer[:-1] + "\n"] - elif self.buffer: - lines = [self.buffer] - else: - lines = [] + # this handles stripping any trailing \r + lines = self.buffer.splitlines() self.buffer = "" return lines diff --git a/tests/models/test_responses.py b/tests/models/test_responses.py index 88d82cf9a7..aae7a78c46 100644 --- a/tests/models/test_responses.py +++ b/tests/models/test_responses.py @@ -639,7 +639,7 @@ def test_iter_lines(): content=b"Hello,\nworld!", ) content = [line for line in response.iter_lines()] - assert content == ["Hello,\n", "world!"] + assert content == ["Hello,", "world!"] @pytest.mark.asyncio @@ -652,7 +652,7 @@ async def test_aiter_lines(): content = [] async for line in response.aiter_lines(): content.append(line) - assert content == ["Hello,\n", "world!"] + assert content == ["Hello,", "world!"] def test_sync_streaming_response(): diff --git a/tests/test_decoders.py b/tests/test_decoders.py index 6767c3ef6c..65d3a4b98c 100644 --- a/tests/test_decoders.py +++ b/tests/test_decoders.py @@ -236,69 +236,69 @@ def test_text_decoder_empty_cases(): def test_line_decoder_nl(): decoder = LineDecoder() assert decoder.decode("") == [] - assert decoder.decode("a\n\nb\nc") == ["a\n", "\n", "b\n"] + assert decoder.decode("a\n\nb\nc") == ["a", "", "b"] assert decoder.flush() == ["c"] decoder = LineDecoder() assert decoder.decode("") == [] - assert decoder.decode("a\n\nb\nc\n") == ["a\n", "\n", "b\n", "c\n"] + assert decoder.decode("a\n\nb\nc\n") == ["a", "", "b", "c"] assert decoder.flush() == [] # Issue #1033 decoder = LineDecoder() assert decoder.decode("") == [] - assert decoder.decode("12345\n") == ["12345\n"] + assert decoder.decode("12345\n") == ["12345"] assert decoder.decode("foo ") == [] assert decoder.decode("bar ") == [] - assert decoder.decode("baz\n") == ["foo bar baz\n"] + assert decoder.decode("baz\n") == ["foo bar baz"] assert decoder.flush() == [] def test_line_decoder_cr(): decoder = LineDecoder() assert decoder.decode("") == [] - assert 
decoder.decode("a\r\rb\rc") == ["a\n", "\n", "b\n"] + assert decoder.decode("a\r\rb\rc") == ["a", "", "b"] assert decoder.flush() == ["c"] decoder = LineDecoder() assert decoder.decode("") == [] - assert decoder.decode("a\r\rb\rc\r") == ["a\n", "\n", "b\n"] - assert decoder.flush() == ["c\n"] + assert decoder.decode("a\r\rb\rc\r") == ["a", "", "b"] + assert decoder.flush() == ["c"] # Issue #1033 decoder = LineDecoder() assert decoder.decode("") == [] assert decoder.decode("12345\r") == [] - assert decoder.decode("foo ") == ["12345\n"] + assert decoder.decode("foo ") == ["12345"] assert decoder.decode("bar ") == [] assert decoder.decode("baz\r") == [] - assert decoder.flush() == ["foo bar baz\n"] + assert decoder.flush() == ["foo bar baz"] def test_line_decoder_crnl(): decoder = LineDecoder() assert decoder.decode("") == [] - assert decoder.decode("a\r\n\r\nb\r\nc") == ["a\n", "\n", "b\n"] + assert decoder.decode("a\r\n\r\nb\r\nc") == ["a", "", "b"] assert decoder.flush() == ["c"] decoder = LineDecoder() assert decoder.decode("") == [] - assert decoder.decode("a\r\n\r\nb\r\nc\r\n") == ["a\n", "\n", "b\n", "c\n"] + assert decoder.decode("a\r\n\r\nb\r\nc\r\n") == ["a", "", "b", "c"] assert decoder.flush() == [] decoder = LineDecoder() assert decoder.decode("") == [] assert decoder.decode("a\r") == [] - assert decoder.decode("\n\r\nb\r\nc") == ["a\n", "\n", "b\n"] + assert decoder.decode("\n\r\nb\r\nc") == ["a", "", "b"] assert decoder.flush() == ["c"] # Issue #1033 decoder = LineDecoder() assert decoder.decode("") == [] - assert decoder.decode("12345\r\n") == ["12345\n"] + assert decoder.decode("12345\r\n") == ["12345"] assert decoder.decode("foo ") == [] assert decoder.decode("bar ") == [] - assert decoder.decode("baz\r\n") == ["foo bar baz\n"] + assert decoder.decode("baz\r\n") == ["foo bar baz"] assert decoder.flush() == []