Use list_objects_v2 API for S3 (#302)

* Initial work * Fix drive-level exists checks * perf measurement * implement listing with list_objects_v2 * formatting * make tests pass * update makefile documentation * revert rig skipping * Update Makefile Co-authored-by: Jay Qi <2721979+jayqi@users.noreply.github.com> * extract out default containers and buckets * remove dotenv from runner * Move google import to avoid import issues * Add pip caching and test independent installs * missing run keys * skip slow oses to test new steps * add notification dependency * install build tools * activate env * Instantiate properties first * Allow azure to instantiate * oses back in and debug commands removed Co-authored-by: Jay Qi <2721979+jayqi@users.noreply.github.com>
drivendataorg · Dec 18, 2022 · 19f314b · 19f314b
1 parent 289baeb
commit 19f314b
Show file tree

Hide file tree

Showing 18 changed files with 524 additions and 54 deletions.
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -18,9 +18,10 @@ jobs:
       - uses: actions/checkout@v2
 
       - name: Set up Python
-        uses: actions/setup-python@v2
+        uses: actions/setup-python@v4
         with:
           python-version: 3.8
+          cache: 'pip' # caching pip dependencies
 
       - name: Install dependencies
         run: |
@@ -44,9 +45,10 @@ jobs:
       - uses: actions/checkout@v2
 
       - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v2
+        uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
+          cache: 'pip' # caching pip dependencies
 
       - name: Install dependencies
         run: |
@@ -82,9 +84,10 @@ jobs:
       - uses: actions/checkout@v2
 
       - name: Set up Python
-        uses: actions/setup-python@v2
+        uses: actions/setup-python@v4
         with:
           python-version: 3.8
+          cache: 'pip' # caching pip dependencies
 
       - name: Install dependencies
         run: |
@@ -119,9 +122,54 @@ jobs:
           file: ./coverage.xml
           fail_ci_if_error: true
 
+  extras-test:
+    name: Test independent installs of clients
+    needs: tests
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        extra: ["s3", "azure", "gs"]
+        include:
+          - prefix: "s3"
+            extra: "s3"
+          - prefix: "az"
+            extra: "azure"
+          - prefix: "gs"
+            extra: "gs"
+
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: 3.8
+          cache: "pip"
+
+      - name: Build cloudpathlib
+        run: |
+          pip install --upgrade pip wheel setuptools
+          make dist  # build cloudpathlib wheel
+
+      - name: Create empty venv 
+        run: |
+          python -m venv ${{ matrix.extra }}-env
+
+      - name: Install cloudpathlib[${{ matrix.extra }}]
+        run: |
+          source ${{ matrix.extra }}-env/bin/activate
+          pip install "$(find dist -name 'cloudpathlib*.whl')[${{ matrix.extra }}]"
+
+      - name: Test ${{ matrix.extra }} usage
+        run: |
+          source ${{ matrix.extra }}-env/bin/activate
+          python -c 'from cloudpathlib import CloudPath; CloudPath("${{ matrix.prefix }}://bucket/test")'
+        env:
+          AZURE_STORAGE_CONNECTION_STRING: ${{ secrets.AZURE_STORAGE_CONNECTION_STRING }}
+
   notify:
     name: Notify failed build
-    needs: [code-quality, tests, live-tests]
+    needs: [code-quality, tests, live-tests, extras-test]
     if: failure() && github.event.pull_request == null
     runs-on: ubuntu-latest
     steps:

diff --git a/.gitignore b/.gitignore
@@ -4,6 +4,9 @@ docs/docs/command-reference/*.md
 docs/docs/index.md
 docs/docs/changelog.md
 
+# perf output
+perf-results.csv
+
 ## GitHub Python .gitignore ##
 # https://github.com/github/gitignore/blob/master/Python.gitignore
 
@@ -76,6 +79,7 @@ target/
 
 # Jupyter Notebook
 .ipynb_checkpoints
+scratchpad.ipynb
 
 # pyenv
 .python-version
@@ -88,6 +92,7 @@ celerybeat-schedule
 
 # dotenv
 .env
+.gscreds.json
 
 # virtualenv
 .venv

diff --git a/Makefile b/Makefile
@@ -51,7 +51,7 @@ docs: clean-docs docs-setup ## build the static version of the docs
 docs-serve: clean-docs docs-setup ## serve documentation to livereload while you work
 	cd docs && mkdocs serve
 
-format:
+format:  ## run black to format codebase
 	black cloudpathlib tests docs
 
 help:
@@ -60,7 +60,7 @@ help:
 install: clean ## install the package to the active Python's site-packages
 	python setup.py install
 
-lint: ## check style with flake8
+lint: ## check style with black, flake8, and mypy
 	black --check cloudpathlib tests docs
 	flake8 cloudpathlib tests docs
 	mypy cloudpathlib
@@ -71,11 +71,17 @@ release: dist ## package and upload a release
 release-test: dist
 	twine upload --repository pypitest dist/*
 
-reqs:
+reqs:  ## install development requirements
 	pip install -U -r requirements-dev.txt
 
 test: ## run tests with mocked cloud SDKs
 	python -m pytest -vv
 
+test-debug:  ## rerun tests that failed in last run and stop with pdb at failures
+	python -m pytest -n=0 -vv --lf --pdb
+
 test-live-cloud:  ## run tests on live cloud backends
 	USE_LIVE_CLOUD=1 python -m pytest -vv
+
+perf:  ## run performance measurement suite for s3 and save results to perf-results.csv
+	python tests/performance/cli.py s3 --save-csv=perf-results.csv
diff --git a/cloudpathlib/azure/azblobclient.py b/cloudpathlib/azure/azblobclient.py
@@ -142,6 +142,10 @@ def _is_file_or_dir(self, cloud_path: AzureBlobPath) -> Optional[str]:
                 return None
 
     def _exists(self, cloud_path: AzureBlobPath) -> bool:
+        # short circuit when only the container
+        if not cloud_path.blob:
+            return self.service_client.get_container_client(cloud_path.container).exists()
+
         return self._is_file_or_dir(cloud_path) in ["file", "dir"]
 
     def _list_dir(

diff --git a/cloudpathlib/client.py b/cloudpathlib/client.py
@@ -31,9 +31,9 @@ def __init__(
         local_cache_dir: Optional[Union[str, os.PathLike]] = None,
         content_type_method: Optional[Callable] = mimetypes.guess_type,
     ):
+        self._cache_tmp_dir = None
         self._cloud_meta.validate_completeness()
         # setup caching and local versions of file and track if it is a tmp dir
-        self._cache_tmp_dir = None
         if local_cache_dir is None:
             self._cache_tmp_dir = TemporaryDirectory()
             local_cache_dir = self._cache_tmp_dir.name

diff --git a/cloudpathlib/gs/gsclient.py b/cloudpathlib/gs/gsclient.py
@@ -4,6 +4,7 @@
 from pathlib import Path, PurePosixPath
 from typing import Any, Callable, Dict, Iterable, Optional, TYPE_CHECKING, Tuple, Union
 
+
 from ..client import Client, register_client_class
 from ..cloudpath import implementation_registry
 from .gspath import GSPath
@@ -12,6 +13,7 @@
     if TYPE_CHECKING:
         from google.auth.credentials import Credentials
 
+    from google.api_core.exceptions import NotFound
     from google.auth.exceptions import DefaultCredentialsError
     from google.cloud.storage import Client as StorageClient
 
@@ -135,6 +137,14 @@ def _is_file_or_dir(self, cloud_path: GSPath) -> Optional[str]:
                 return None
 
     def _exists(self, cloud_path: GSPath) -> bool:
+        # short-circuit the root-level bucket
+        if not cloud_path.blob:
+            try:
+                next(self.client.bucket(cloud_path.bucket).list_blobs())
+                return True
+            except NotFound:
+                return False
+
         return self._is_file_or_dir(cloud_path) in ["file", "dir"]
 
     def _list_dir(self, cloud_path: GSPath, recursive=False) -> Iterable[Tuple[GSPath, bool]]:

diff --git a/cloudpathlib/s3/s3client.py b/cloudpathlib/s3/s3client.py
@@ -143,12 +143,19 @@ def _is_file_or_dir(self, cloud_path: S3Path) -> Optional[str]:
             return "dir"
 
     def _exists(self, cloud_path: S3Path) -> bool:
+        # check if this is a bucket
+        if not cloud_path.key:
+            try:
+                self.client.head_bucket(Bucket=cloud_path.bucket)
+                return True
+            except ClientError:
+                return False
+
         return self._s3_file_query(cloud_path) is not None
 
     def _s3_file_query(self, cloud_path: S3Path):
         """Boto3 query used for quick checks of existence and if path is file/dir"""
-        # first check if this is an object that we can access directly
-
+        # check if this is an object that we can access directly
         try:
             obj = self.s3.Object(cloud_path.bucket, cloud_path.key)
             obj.load()
@@ -169,40 +176,55 @@ def _s3_file_query(self, cloud_path: S3Path):
             )
 
     def _list_dir(self, cloud_path: S3Path, recursive=False) -> Iterable[Tuple[S3Path, bool]]:
-        bucket = self.s3.Bucket(cloud_path.bucket)
-
         prefix = cloud_path.key
         if prefix and not prefix.endswith("/"):
             prefix += "/"
 
         yielded_dirs = set()
 
-        if recursive:
-            for o in bucket.objects.filter(Prefix=prefix):
-                # get directory from this path
-                for parent in PurePosixPath(o.key[len(prefix) :]).parents:
-                    # if we haven't surfaced their directory already
-                    if parent not in yielded_dirs and str(parent) != ".":
-                        yield (self.CloudPath(f"s3://{cloud_path.bucket}/{prefix}{parent}"), True)
-                        yielded_dirs.add(parent)
+        paginator = self.client.get_paginator("list_objects_v2")
 
-                yield (self.CloudPath(f"s3://{o.bucket_name}/{o.key}"), False)
-        else:
-            # non recursive is best done with old client API rather than resource
-            paginator = self.client.get_paginator("list_objects")
-
-            for result in paginator.paginate(
-                Bucket=cloud_path.bucket, Prefix=prefix, Delimiter="/"
-            ):
-                # sub directory names
-                for result_prefix in result.get("CommonPrefixes", []):
+        for result in paginator.paginate(
+            Bucket=cloud_path.bucket, Prefix=prefix, Delimiter=("" if recursive else "/")
+        ):
+            # yield everything in common prefixes as directories
+            for result_prefix in result.get("CommonPrefixes", []):
+                canonical = result_prefix.get("Prefix").rstrip("/")  # keep a canonical form
+                if canonical not in yielded_dirs:
+                    yield (
+                        self.CloudPath(f"s3://{cloud_path.bucket}/{canonical}"),
+                        True,
+                    )
+                    yielded_dirs.add(canonical)
+
+            # check all the keys
+            for result_key in result.get("Contents", []):
+                # yield all the parents of any key that have not been yielded already
+                o_relative_path = result_key.get("Key")[len(prefix) :]
+                for parent in PurePosixPath(o_relative_path).parents:
+                    parent_canonical = prefix + str(parent).rstrip("/")
+                    if parent_canonical not in yielded_dirs and str(parent) != ".":
+                        yield (
+                            self.CloudPath(f"s3://{cloud_path.bucket}/{parent_canonical}"),
+                            True,
+                        )
+                        yielded_dirs.add(parent_canonical)
+
+                # if we already yielded this dir, go to next item in contents
+                canonical = result_key.get("Key").rstrip("/")
+                if canonical in yielded_dirs:
+                    continue
+
+                # s3 fake directories have 0 size and end with "/"
+                if result_key.get("Key").endswith("/") and result_key.get("Size") == 0:
                     yield (
-                        self.CloudPath(f"s3://{cloud_path.bucket}/{result_prefix.get('Prefix')}"),
+                        self.CloudPath(f"s3://{cloud_path.bucket}/{canonical}"),
                         True,
                     )
+                    yielded_dirs.add(canonical)
 
-                # files in the directory
-                for result_key in result.get("Contents", []):
+                # yield object as file
+                else:
                     yield (
                         self.CloudPath(f"s3://{cloud_path.bucket}/{result_key.get('Key')}"),
                         False,

diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -5,6 +5,7 @@ flake8
 ipytest
 ipython
 jupyter
+loguru
 matplotlib
 mike
 mkdocs>=1.2.2
@@ -19,8 +20,12 @@ pydantic
 pytest
 pytest-cases
 pytest-cov
+pytest-xdist
 python-dotenv
 pywin32; sys_platform == 'win32'
+rich
 shortuuid
 tabulate
+tqdm
+typer
 wheel
diff --git a/setup.cfg b/setup.cfg
@@ -21,7 +21,7 @@ ignore_errors = True
 
 [tool:pytest]
 testpaths = tests/
-addopts = --cov=cloudpathlib --cov-report=term --cov-report=html --cov-report=xml
+addopts = --cov=cloudpathlib --cov-report=term --cov-report=html --cov-report=xml -n=auto
 
 [coverage:report]
 include = cloudpathlib/**.py