diff --git a/fsspec/implementations/http.py b/fsspec/implementations/http.py
index 0504e2968..ea8d79d46 100644
--- a/fsspec/implementations/http.py
+++ b/fsspec/implementations/http.py
@@ -158,14 +158,24 @@ async def _ls_real(self, url, detail=True, **kwargs):
         session = await self.set_session()
         async with session.get(self.encode_url(url), **self.kwargs) as r:
             self._raise_not_found_for_status(r, url)
-            try:
-                text = await r.text()
-                if self.simple_links:
-                    links = ex2.findall(text) + [u[2] for u in ex.findall(text)]
-                else:
-                    links = [u[2] for u in ex.findall(text)]
-            except UnicodeDecodeError:
-                links = []  # binary, not HTML
+
+            if "Content-Type" in r.headers:
+                mimetype = r.headers["Content-Type"].partition(";")[0]
+            else:
+                mimetype = None
+
+            if mimetype in ("text/html", None):
+                try:
+                    text = await r.text(errors="ignore")
+                    if self.simple_links:
+                        links = ex2.findall(text) + [u[2] for u in ex.findall(text)]
+                    else:
+                        links = [u[2] for u in ex.findall(text)]
+                except UnicodeDecodeError:
+                    links = []  # binary, not HTML
+            else:
+                links = []
+
         out = set()
         parts = urlparse(url)
         for l in links:
diff --git a/fsspec/implementations/tests/test_http.py b/fsspec/implementations/tests/test_http.py
index abefbce0a..d014d1155 100644
--- a/fsspec/implementations/tests/test_http.py
+++ b/fsspec/implementations/tests/test_http.py
@@ -139,6 +139,11 @@ def test_glob_return_subfolders(server):
 
 
 def test_isdir(server):
+    h = fsspec.filesystem("http", headers={"give_mimetype": "true"})
+    assert h.isdir(server.address + "/index/")
+    assert not h.isdir(server.realfile)
+    assert not h.isdir(server.address + "doesnotevenexist")
+
     h = fsspec.filesystem("http")
     assert h.isdir(server.address + "/index/")
     assert not h.isdir(server.realfile)