From 4cd61eb08fa8efc335778bc59c996a00c43ba93e Mon Sep 17 00:00:00 2001
From: Aleksandar Mastilovic <amastilovic@wikimedia.org>
Date: Thu, 10 Jul 2025 15:11:49 -0700
Subject: [PATCH 1/2] Fix HTTPFileSystem.isdir downloading the whole file

Method _ls_real downloads the whole body of a link via r.text()
regardless of the response's content type. Skip this download unless
the Content-Type header is not set or is set to text/html.
---
 fsspec/implementations/http.py | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/fsspec/implementations/http.py b/fsspec/implementations/http.py
index 0504e2968..d3643ae62 100644
--- a/fsspec/implementations/http.py
+++ b/fsspec/implementations/http.py
@@ -158,14 +158,20 @@ async def _ls_real(self, url, detail=True, **kwargs):
         session = await self.set_session()
         async with session.get(self.encode_url(url), **self.kwargs) as r:
             self._raise_not_found_for_status(r, url)
-            try:
-                text = await r.text()
-                if self.simple_links:
-                    links = ex2.findall(text) + [u[2] for u in ex.findall(text)]
-                else:
-                    links = [u[2] for u in ex.findall(text)]
-            except UnicodeDecodeError:
-                links = []  # binary, not HTML
+            url_info = await self._info(url, **kwargs)
+            mimetype = url_info.get("mimetype", None)
+            if mimetype in ("text/html", None):
+                try:
+                    text = await r.text()
+                    if self.simple_links:
+                        links = ex2.findall(text) + [u[2] for u in ex.findall(text)]
+                    else:
+                        links = [u[2] for u in ex.findall(text)]
+                except UnicodeDecodeError:
+                    links = []  # binary, not HTML
+            else:
+                links = []
+
         out = set()
         parts = urlparse(url)
         for l in links:

From 49b6e904d56eae413473bcc1359115d39694cbe7 Mon Sep 17 00:00:00 2001
From: Aleksandar Mastilovic <amastilovic@wikimedia.org>
Date: Fri, 11 Jul 2025 10:55:53 -0700
Subject: [PATCH 2/2] Address review comments, add unit test case

* Get Content-Type from the response headers instead of another `_info()` call
* Use `r.text(errors="ignore")`
* Add a `test_isdir` case for when MIME type is present
---
 fsspec/implementations/http.py            | 10 +++++++---
 fsspec/implementations/tests/test_http.py |  5 +++++
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/fsspec/implementations/http.py b/fsspec/implementations/http.py
index d3643ae62..ea8d79d46 100644
--- a/fsspec/implementations/http.py
+++ b/fsspec/implementations/http.py
@@ -158,11 +158,15 @@ async def _ls_real(self, url, detail=True, **kwargs):
         session = await self.set_session()
         async with session.get(self.encode_url(url), **self.kwargs) as r:
             self._raise_not_found_for_status(r, url)
-            url_info = await self._info(url, **kwargs)
-            mimetype = url_info.get("mimetype", None)
+
+            if "Content-Type" in r.headers:
+                mimetype = r.headers["Content-Type"].partition(";")[0]
+            else:
+                mimetype = None
+
             if mimetype in ("text/html", None):
                 try:
-                    text = await r.text()
+                    text = await r.text(errors="ignore")
                     if self.simple_links:
                         links = ex2.findall(text) + [u[2] for u in ex.findall(text)]
                     else:
diff --git a/fsspec/implementations/tests/test_http.py b/fsspec/implementations/tests/test_http.py
index abefbce0a..d014d1155 100644
--- a/fsspec/implementations/tests/test_http.py
+++ b/fsspec/implementations/tests/test_http.py
@@ -139,6 +139,11 @@ def test_glob_return_subfolders(server):
 
 
 def test_isdir(server):
+    h = fsspec.filesystem("http", headers={"give_mimetype": "true"})
+    assert h.isdir(server.address + "/index/")
+    assert not h.isdir(server.realfile)
+    assert not h.isdir(server.address + "doesnotevenexist")
+
     h = fsspec.filesystem("http")
     assert h.isdir(server.address + "/index/")
     assert not h.isdir(server.realfile)