Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use list_objects_v2 API for S3 #302

Merged
merged 21 commits into from Dec 18, 2022
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
5 changes: 5 additions & 0 deletions .gitignore
Expand Up @@ -4,6 +4,9 @@ docs/docs/command-reference/*.md
docs/docs/index.md
docs/docs/changelog.md

# perf output
perf-results.csv

## GitHub Python .gitignore ##
# https://github.com/github/gitignore/blob/master/Python.gitignore

Expand Down Expand Up @@ -76,6 +79,7 @@ target/

# Jupyter Notebook
.ipynb_checkpoints
scratchpad.ipynb

# pyenv
.python-version
Expand All @@ -88,6 +92,7 @@ celerybeat-schedule

# dotenv
.env
.gscreds.json

# virtualenv
.venv
Expand Down
12 changes: 9 additions & 3 deletions Makefile
Expand Up @@ -51,7 +51,7 @@ docs: clean-docs docs-setup ## build the static version of the docs
docs-serve: clean-docs docs-setup ## serve documentation to livereload while you work
cd docs && mkdocs serve

format:
format: ## run black to format codebase
pjbull marked this conversation as resolved.
Show resolved Hide resolved
black cloudpathlib tests docs

help:
Expand All @@ -60,7 +60,7 @@ help:
install: clean ## install the package to the active Python's site-packages
python setup.py install

lint: ## check style with flake8
lint: ## check style with black, flake8, and mypy
black --check cloudpathlib tests docs
flake8 cloudpathlib tests docs
mypy cloudpathlib
Expand All @@ -71,11 +71,17 @@ release: dist ## package and upload a release
release-test: dist
twine upload --repository pypitest dist/*

reqs:
reqs: ## install development requirements
pip install -U -r requirements-dev.txt

test: ## run tests with mocked cloud SDKs
python -m pytest -vv

test-debug: ## rerun tests that failed in last run and stop with pdb at failures
python -m pytest -n=0 -vv --lf --pdb

test-live-cloud: ## run tests on live cloud backends
USE_LIVE_CLOUD=1 python -m pytest -vv

perf: ## run performance measurement suite for s3 and save results to perf-results.csv
python tests/performance/cli.py s3 --save-csv=perf-results.csv
4 changes: 4 additions & 0 deletions cloudpathlib/azure/azblobclient.py
Expand Up @@ -142,6 +142,10 @@ def _is_file_or_dir(self, cloud_path: AzureBlobPath) -> Optional[str]:
return None

def _exists(self, cloud_path: AzureBlobPath) -> bool:
    """Return whether ``cloud_path`` exists on the Azure Blob backend.

    A path with a blob component is checked via ``_is_file_or_dir``; a
    container-only path is checked directly against the container client.
    """
    if cloud_path.blob:
        return self._is_file_or_dir(cloud_path) in ["file", "dir"]

    # container-only path: ask the service for the container itself
    container_client = self.service_client.get_container_client(cloud_path.container)
    return container_client.exists()

def _list_dir(
Expand Down
10 changes: 10 additions & 0 deletions cloudpathlib/gs/gsclient.py
Expand Up @@ -4,6 +4,8 @@
from pathlib import Path, PurePosixPath
from typing import Any, Callable, Dict, Iterable, Optional, TYPE_CHECKING, Tuple, Union

from google.api_core.exceptions import NotFound

from ..client import Client, register_client_class
from ..cloudpath import implementation_registry
from .gspath import GSPath
Expand Down Expand Up @@ -135,6 +137,14 @@ def _is_file_or_dir(self, cloud_path: GSPath) -> Optional[str]:
return None

def _exists(self, cloud_path: GSPath) -> bool:
    """Return whether ``cloud_path`` exists on the GCS backend.

    For a bucket-only path (no blob component), existence is probed by
    starting an iteration of the bucket's blobs: a missing bucket raises
    ``NotFound``.
    """
    # short-circuit the root-level bucket
    if not cloud_path.blob:
        try:
            # Pull at most one item purely to trigger NotFound for a missing
            # bucket. The default sentinel prevents StopIteration from
            # escaping when the bucket exists but is empty (the original
            # bare next() would raise in that case instead of returning True).
            next(iter(self.client.bucket(cloud_path.bucket).list_blobs()), None)
            return True
        except NotFound:
            return False

    return self._is_file_or_dir(cloud_path) in ["file", "dir"]

def _list_dir(self, cloud_path: GSPath, recursive=False) -> Iterable[Tuple[GSPath, bool]]:
Expand Down
72 changes: 47 additions & 25 deletions cloudpathlib/s3/s3client.py
Expand Up @@ -143,12 +143,19 @@ def _is_file_or_dir(self, cloud_path: S3Path) -> Optional[str]:
return "dir"

def _exists(self, cloud_path: S3Path) -> bool:
    """Return whether ``cloud_path`` exists on S3.

    A path with a key is resolved through ``_s3_file_query``; a
    bucket-only path is probed with a ``head_bucket`` call, where any
    ``ClientError`` is treated as "does not exist".
    """
    if cloud_path.key:
        return self._s3_file_query(cloud_path) is not None

    # bucket-only path: probe the bucket itself
    try:
        self.client.head_bucket(Bucket=cloud_path.bucket)
    except ClientError:
        return False
    return True

def _s3_file_query(self, cloud_path: S3Path):
"""Boto3 query used for quick checks of existence and if path is file/dir"""
# first check if this is an object that we can access directly

# check if this is an object that we can access directly
try:
obj = self.s3.Object(cloud_path.bucket, cloud_path.key)
obj.load()
Expand All @@ -169,40 +176,55 @@ def _s3_file_query(self, cloud_path: S3Path):
)

def _list_dir(self, cloud_path: S3Path, recursive=False) -> Iterable[Tuple[S3Path, bool]]:
bucket = self.s3.Bucket(cloud_path.bucket)

prefix = cloud_path.key
if prefix and not prefix.endswith("/"):
prefix += "/"

yielded_dirs = set()

if recursive:
for o in bucket.objects.filter(Prefix=prefix):
# get directory from this path
for parent in PurePosixPath(o.key[len(prefix) :]).parents:
# if we haven't surfaced their directory already
if parent not in yielded_dirs and str(parent) != ".":
yield (self.CloudPath(f"s3://{cloud_path.bucket}/{prefix}{parent}"), True)
yielded_dirs.add(parent)
paginator = self.client.get_paginator("list_objects_v2")

yield (self.CloudPath(f"s3://{o.bucket_name}/{o.key}"), False)
else:
# non recursive is best done with old client API rather than resource
paginator = self.client.get_paginator("list_objects")

for result in paginator.paginate(
Bucket=cloud_path.bucket, Prefix=prefix, Delimiter="/"
):
# sub directory names
for result_prefix in result.get("CommonPrefixes", []):
for result in paginator.paginate(
Bucket=cloud_path.bucket, Prefix=prefix, Delimiter=("" if recursive else "/")
):
# yield everything in common prefixes as directories
for result_prefix in result.get("CommonPrefixes", []):
canonical = result_prefix.get("Prefix").rstrip("/") # keep a canonical form
if canonical not in yielded_dirs:
yield (
self.CloudPath(f"s3://{cloud_path.bucket}/{canonical}"),
True,
)
yielded_dirs.add(canonical)

# check all the keys
for result_key in result.get("Contents", []):
# yield all the parents of any key that have not been yielded already
o_relative_path = result_key.get("Key")[len(prefix) :]
for parent in PurePosixPath(o_relative_path).parents:
parent_canonical = prefix + str(parent).rstrip("/")
if parent_canonical not in yielded_dirs and str(parent) != ".":
yield (
self.CloudPath(f"s3://{cloud_path.bucket}/{parent_canonical}"),
True,
)
yielded_dirs.add(parent_canonical)

# if we already yielded this dir, go to next item in contents
canonical = result_key.get("Key").rstrip("/")
if canonical in yielded_dirs:
continue

# s3 fake directories have 0 size and end with "/"
if result_key.get("Key").endswith("/") and result_key.get("Size") == 0:
yield (
self.CloudPath(f"s3://{cloud_path.bucket}/{result_prefix.get('Prefix')}"),
self.CloudPath(f"s3://{cloud_path.bucket}/{canonical}"),
True,
)
yielded_dirs.add(canonical)

# files in the directory
for result_key in result.get("Contents", []):
# yield object as file
else:
yield (
self.CloudPath(f"s3://{cloud_path.bucket}/{result_key.get('Key')}"),
False,
Expand Down
5 changes: 5 additions & 0 deletions requirements-dev.txt
Expand Up @@ -5,6 +5,7 @@ flake8
ipytest
ipython
jupyter
loguru
matplotlib
mike
mkdocs>=1.2.2
Expand All @@ -19,8 +20,12 @@ pydantic
pytest
pytest-cases
pytest-cov
pytest-xdist
python-dotenv
pywin32; sys_platform == 'win32'
rich
shortuuid
tabulate
tqdm
typer
wheel
2 changes: 1 addition & 1 deletion setup.cfg
Expand Up @@ -21,7 +21,7 @@ ignore_errors = True

[tool:pytest]
testpaths = tests/
addopts = --cov=cloudpathlib --cov-report=term --cov-report=html --cov-report=xml
addopts = --cov=cloudpathlib --cov-report=term --cov-report=html --cov-report=xml -n=auto

[coverage:report]
include = cloudpathlib/**.py
11 changes: 9 additions & 2 deletions tests/mock_clients/mock_azureblob.py
Expand Up @@ -33,7 +33,7 @@ def get_blob_client(self, container, blob):
return MockBlobClient(self.tmp_path, blob, service_client=self)

def get_container_client(self, container):
return MockContainerClient(self.tmp_path)
return MockContainerClient(self.tmp_path, container_name=container)

return MockBlobServiceClient

Expand Down Expand Up @@ -106,8 +106,15 @@ def content_as_bytes(self):


class MockContainerClient:
def __init__(self, root):
def __init__(self, root, container_name):
self.root = root
self.container_name = container_name

def exists(self):
    # Only the container named "container" is treated as existing
    # (the name used by passing tests).
    return self.container_name == "container"

def list_blobs(self, name_starts_with=None):
return mock_item_paged(self.root, name_starts_with)
Expand Down
16 changes: 13 additions & 3 deletions tests/mock_clients/mock_gs.py
Expand Up @@ -3,6 +3,8 @@
import shutil
from tempfile import TemporaryDirectory

from google.api_core.exceptions import NotFound

from .utils import delete_empty_parents_up_to_root

TEST_ASSETS = Path(__file__).parent.parent / "assets"
Expand Down Expand Up @@ -30,7 +32,7 @@ def __del__(self):
self.tmp.cleanup()

def bucket(self, bucket):
return MockBucket(self.tmp_path, client=self)
return MockBucket(self.tmp_path, bucket, client=self)

return MockClient

Expand Down Expand Up @@ -87,8 +89,9 @@ def content_type(self):


class MockBucket:
def __init__(self, name, client=None):
def __init__(self, name, bucket_name, client=None):
self.name = name
self.bucket_name = bucket_name
self.client = client

def blob(self, blob):
Expand All @@ -113,7 +116,14 @@ def list_blobs(self, max_results=None, prefix=None):
for f in path.glob("**/*")
if f.is_file() and not f.name.startswith(".")
]
return MockHTTPIterator(items, max_results)

# bucket name for passing tests
if self.bucket_name == "bucket":
return iter(MockHTTPIterator(items, max_results))
else:
raise NotFound(
f"Bucket {self.name} not expected as mock bucket; only 'bucket' exists."
)


class MockHTTPIterator:
Expand Down
36 changes: 34 additions & 2 deletions tests/mock_clients/mock_s3.py
Expand Up @@ -199,6 +199,19 @@ def __init__(self, root, session=None):
def get_paginator(self, api):
return MockBoto3Paginator(self.root, session=self.session)

def head_bucket(self, Bucket):
    """Mock of the S3 ``head_bucket`` API: only the bucket named
    "bucket" (used in passing tests) exists; any other name raises
    ``ClientError`` as the real client would."""
    if Bucket != "bucket":
        raise ClientError(
            {
                "Error": {
                    "Message": f"Bucket {Bucket} not expected as mock bucket; only 'bucket' exists."
                }
            },
            {},
        )
    return {"Bucket": Bucket}

@property
def exceptions(self):
Ex = collections.namedtuple("Ex", "NoSuchKey")
Expand All @@ -214,14 +227,33 @@ def __init__(self, root, per_page=2, session=None):
def paginate(self, Bucket=None, Prefix="", Delimiter=None):
new_dir = self.root / Prefix

items = [f for f in new_dir.iterdir() if not f.name.startswith(".")]
if Delimiter == "/":
items = [f for f in new_dir.iterdir() if not f.name.startswith(".")]
else:
items = [f for f in new_dir.rglob("*") if not f.name.startswith(".")]

for ix in range(0, len(items), self.per_page):
page = items[ix : ix + self.per_page]
dirs = [
{"Prefix": str(_.relative_to(self.root).as_posix())} for _ in page if _.is_dir()
]
files = [
{"Key": str(_.relative_to(self.root).as_posix())} for _ in page if _.is_file()
{
"Key": str(_.relative_to(self.root).as_posix()),
"Size": 123
if not _.relative_to(self.root).exists()
else _.relative_to(self.root).stat().st_size,
}
for _ in page
if _.is_file()
]

# s3 can have "fake" directories where size is 0, but it is listed in "Contents" (see #198)
# add one in here for testing
if dirs:
fake_dir = dirs.pop(-1)
fake_dir["Size"] = 0
fake_dir["Key"] = fake_dir.pop("Prefix") + "/" # fake dirs have '/' appended
files.append(fake_dir)

yield {"CommonPrefixes": dirs, "Contents": files}
Empty file added tests/performance/__init__.py
Empty file.