From 4cd61eb08fa8efc335778bc59c996a00c43ba93e Mon Sep 17 00:00:00 2001 From: Aleksandar Mastilovic Date: Thu, 10 Jul 2025 15:11:49 -0700 Subject: [PATCH 1/2] Fix HTTPFileSystem isdir downloading the whole file Method _ls_real downloads the whole response body via r.text() regardless of the response's content type. Skip this download unless the Content-Type header is not set or is set to text/html. --- fsspec/implementations/http.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/fsspec/implementations/http.py b/fsspec/implementations/http.py index 0504e2968..d3643ae62 100644 --- a/fsspec/implementations/http.py +++ b/fsspec/implementations/http.py @@ -158,14 +158,20 @@ async def _ls_real(self, url, detail=True, **kwargs): session = await self.set_session() async with session.get(self.encode_url(url), **self.kwargs) as r: self._raise_not_found_for_status(r, url) - try: - text = await r.text() - if self.simple_links: - links = ex2.findall(text) + [u[2] for u in ex.findall(text)] - else: - links = [u[2] for u in ex.findall(text)] - except UnicodeDecodeError: - links = [] # binary, not HTML + url_info = await self._info(url, **kwargs) + mimetype = url_info.get("mimetype", None) + if mimetype in ("text/html", None): + try: + text = await r.text() + if self.simple_links: + links = ex2.findall(text) + [u[2] for u in ex.findall(text)] + else: + links = [u[2] for u in ex.findall(text)] + except UnicodeDecodeError: + links = [] # binary, not HTML + else: + links = [] + out = set() parts = urlparse(url) for l in links: From 49b6e904d56eae413473bcc1359115d39694cbe7 Mon Sep 17 00:00:00 2001 From: Aleksandar Mastilovic Date: Fri, 11 Jul 2025 10:55:53 -0700 Subject: [PATCH 2/2] Address review comments, add unit test case * Get Content-Type from headers instead of another `._info()` call * Use `r.text(errors="ignore")` * Add a `test_isdir` case for when MIME type is present --- fsspec/implementations/http.py | 10
+++++++--- fsspec/implementations/tests/test_http.py | 5 +++++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/fsspec/implementations/http.py b/fsspec/implementations/http.py index d3643ae62..ea8d79d46 100644 --- a/fsspec/implementations/http.py +++ b/fsspec/implementations/http.py @@ -158,11 +158,15 @@ async def _ls_real(self, url, detail=True, **kwargs): session = await self.set_session() async with session.get(self.encode_url(url), **self.kwargs) as r: self._raise_not_found_for_status(r, url) - url_info = await self._info(url, **kwargs) - mimetype = url_info.get("mimetype", None) + + if "Content-Type" in r.headers: + mimetype = r.headers["Content-Type"].partition(";")[0] + else: + mimetype = None + if mimetype in ("text/html", None): try: - text = await r.text() + text = await r.text(errors="ignore") if self.simple_links: links = ex2.findall(text) + [u[2] for u in ex.findall(text)] else: diff --git a/fsspec/implementations/tests/test_http.py b/fsspec/implementations/tests/test_http.py index abefbce0a..d014d1155 100644 --- a/fsspec/implementations/tests/test_http.py +++ b/fsspec/implementations/tests/test_http.py @@ -139,6 +139,11 @@ def test_glob_return_subfolders(server): def test_isdir(server): + h = fsspec.filesystem("http", headers={"give_mimetype": "true"}) + assert h.isdir(server.address + "/index/") + assert not h.isdir(server.realfile) + assert not h.isdir(server.address + "doesnotevenexist") + h = fsspec.filesystem("http") assert h.isdir(server.address + "/index/") assert not h.isdir(server.realfile)