Skip to content

Commit

Permalink
Use list_objects_v2 API for S3 (#302)
Browse files Browse the repository at this point in the history
* Initial work

* Fix drive-level exists checks

* perf measurement

* implement listing with list_objects_v2

* formatting

* make tests pass

* update makefile documentation

* revert rig skipping

* Update Makefile

Co-authored-by: Jay Qi <2721979+jayqi@users.noreply.github.com>

* extract out default containers and buckets

* remove dotenv from runner

* Move google import to avoid import issues

* Add pip caching and test independent installs

* missing run keys

* skip slow oses to test new steps

* add notification dependency

* install build tools

* activate env

* Instantiate properties first

* Allow azure to instantiate

* oses back in and debug commands removed

Co-authored-by: Jay Qi <2721979+jayqi@users.noreply.github.com>
  • Loading branch information
pjbull and jayqi committed Dec 18, 2022
1 parent 289baeb commit 19f314b
Show file tree
Hide file tree
Showing 18 changed files with 524 additions and 54 deletions.
56 changes: 52 additions & 4 deletions .github/workflows/tests.yml
Expand Up @@ -18,9 +18,10 @@ jobs:
- uses: actions/checkout@v2

- name: Set up Python
uses: actions/setup-python@v2
uses: actions/setup-python@v4
with:
python-version: 3.8
cache: 'pip' # caching pip dependencies

- name: Install dependencies
run: |
Expand All @@ -44,9 +45,10 @@ jobs:
- uses: actions/checkout@v2

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
cache: 'pip' # caching pip dependencies

- name: Install dependencies
run: |
Expand Down Expand Up @@ -82,9 +84,10 @@ jobs:
- uses: actions/checkout@v2

- name: Set up Python
uses: actions/setup-python@v2
uses: actions/setup-python@v4
with:
python-version: 3.8
cache: 'pip' # caching pip dependencies

- name: Install dependencies
run: |
Expand Down Expand Up @@ -119,9 +122,54 @@ jobs:
file: ./coverage.xml
fail_ci_if_error: true

extras-test:
name: Test independent installs of clients
needs: tests
runs-on: ubuntu-latest
strategy:
matrix:
extra: ["s3", "azure", "gs"]
include:
- prefix: "s3"
extra: "s3"
- prefix: "az"
extra: "azure"
- prefix: "gs"
extra: "gs"

steps:
- uses: actions/checkout@v2

- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: 3.8
cache: "pip"

- name: Build cloudpathlib
run: |
pip install --upgrade pip wheel setuptools
make dist # build cloudpathlib wheel
- name: Create empty venv
run: |
python -m venv ${{ matrix.extra }}-env
- name: Install cloudpathlib[${{ matrix.extra }}]
run: |
source ${{ matrix.extra }}-env/bin/activate
pip install "$(find dist -name 'cloudpathlib*.whl')[${{ matrix.extra }}]"
- name: Test ${{ matrix.extra }} usage
run: |
source ${{ matrix.extra }}-env/bin/activate
python -c 'from cloudpathlib import CloudPath; CloudPath("${{ matrix.prefix }}://bucket/test")'
env:
AZURE_STORAGE_CONNECTION_STRING: ${{ secrets.AZURE_STORAGE_CONNECTION_STRING }}

notify:
name: Notify failed build
needs: [code-quality, tests, live-tests]
needs: [code-quality, tests, live-tests, extras-test]
if: failure() && github.event.pull_request == null
runs-on: ubuntu-latest
steps:
Expand Down
5 changes: 5 additions & 0 deletions .gitignore
Expand Up @@ -4,6 +4,9 @@ docs/docs/command-reference/*.md
docs/docs/index.md
docs/docs/changelog.md

# perf output
perf-results.csv

## GitHub Python .gitignore ##
# https://github.com/github/gitignore/blob/master/Python.gitignore

Expand Down Expand Up @@ -76,6 +79,7 @@ target/

# Jupyter Notebook
.ipynb_checkpoints
scratchpad.ipynb

# pyenv
.python-version
Expand All @@ -88,6 +92,7 @@ celerybeat-schedule

# dotenv
.env
.gscreds.json

# virtualenv
.venv
Expand Down
12 changes: 9 additions & 3 deletions Makefile
Expand Up @@ -51,7 +51,7 @@ docs: clean-docs docs-setup ## build the static version of the docs
docs-serve: clean-docs docs-setup ## serve documentation to livereload while you work
cd docs && mkdocs serve

format:
format: ## run black to format codebase
black cloudpathlib tests docs

help:
Expand All @@ -60,7 +60,7 @@ help:
install: clean ## install the package to the active Python's site-packages
python setup.py install

lint: ## check style with flake8
lint: ## check style with black, flake8, and mypy
black --check cloudpathlib tests docs
flake8 cloudpathlib tests docs
mypy cloudpathlib
Expand All @@ -71,11 +71,17 @@ release: dist ## package and upload a release
release-test: dist
twine upload --repository pypitest dist/*

reqs:
reqs: ## install development requirements
pip install -U -r requirements-dev.txt

test: ## run tests with mocked cloud SDKs
python -m pytest -vv

test-debug: ## rerun tests that failed in last run and stop with pdb at failures
python -m pytest -n=0 -vv --lf --pdb

test-live-cloud: ## run tests on live cloud backends
USE_LIVE_CLOUD=1 python -m pytest -vv

perf: ## run performance measurement suite for s3 and save results to perf-results.csv
python tests/performance/cli.py s3 --save-csv=perf-results.csv
4 changes: 4 additions & 0 deletions cloudpathlib/azure/azblobclient.py
Expand Up @@ -142,6 +142,10 @@ def _is_file_or_dir(self, cloud_path: AzureBlobPath) -> Optional[str]:
return None

def _exists(self, cloud_path: AzureBlobPath) -> bool:
# short circuit when only the container
if not cloud_path.blob:
return self.service_client.get_container_client(cloud_path.container).exists()

return self._is_file_or_dir(cloud_path) in ["file", "dir"]

def _list_dir(
Expand Down
2 changes: 1 addition & 1 deletion cloudpathlib/client.py
Expand Up @@ -31,9 +31,9 @@ def __init__(
local_cache_dir: Optional[Union[str, os.PathLike]] = None,
content_type_method: Optional[Callable] = mimetypes.guess_type,
):
self._cache_tmp_dir = None
self._cloud_meta.validate_completeness()
# setup caching and local versions of file and track if it is a tmp dir
self._cache_tmp_dir = None
if local_cache_dir is None:
self._cache_tmp_dir = TemporaryDirectory()
local_cache_dir = self._cache_tmp_dir.name
Expand Down
10 changes: 10 additions & 0 deletions cloudpathlib/gs/gsclient.py
Expand Up @@ -4,6 +4,7 @@
from pathlib import Path, PurePosixPath
from typing import Any, Callable, Dict, Iterable, Optional, TYPE_CHECKING, Tuple, Union


from ..client import Client, register_client_class
from ..cloudpath import implementation_registry
from .gspath import GSPath
Expand All @@ -12,6 +13,7 @@
if TYPE_CHECKING:
from google.auth.credentials import Credentials

from google.api_core.exceptions import NotFound
from google.auth.exceptions import DefaultCredentialsError
from google.cloud.storage import Client as StorageClient

Expand Down Expand Up @@ -135,6 +137,14 @@ def _is_file_or_dir(self, cloud_path: GSPath) -> Optional[str]:
return None

def _exists(self, cloud_path: GSPath) -> bool:
# short-circuit the root-level bucket
if not cloud_path.blob:
try:
next(self.client.bucket(cloud_path.bucket).list_blobs())
return True
except NotFound:
return False

return self._is_file_or_dir(cloud_path) in ["file", "dir"]

def _list_dir(self, cloud_path: GSPath, recursive=False) -> Iterable[Tuple[GSPath, bool]]:
Expand Down
72 changes: 47 additions & 25 deletions cloudpathlib/s3/s3client.py
Expand Up @@ -143,12 +143,19 @@ def _is_file_or_dir(self, cloud_path: S3Path) -> Optional[str]:
return "dir"

def _exists(self, cloud_path: S3Path) -> bool:
# check if this is a bucket
if not cloud_path.key:
try:
self.client.head_bucket(Bucket=cloud_path.bucket)
return True
except ClientError:
return False

return self._s3_file_query(cloud_path) is not None

def _s3_file_query(self, cloud_path: S3Path):
"""Boto3 query used for quick checks of existence and if path is file/dir"""
# first check if this is an object that we can access directly

# check if this is an object that we can access directly
try:
obj = self.s3.Object(cloud_path.bucket, cloud_path.key)
obj.load()
Expand All @@ -169,40 +176,55 @@ def _s3_file_query(self, cloud_path: S3Path):
)

def _list_dir(self, cloud_path: S3Path, recursive=False) -> Iterable[Tuple[S3Path, bool]]:
bucket = self.s3.Bucket(cloud_path.bucket)

prefix = cloud_path.key
if prefix and not prefix.endswith("/"):
prefix += "/"

yielded_dirs = set()

if recursive:
for o in bucket.objects.filter(Prefix=prefix):
# get directory from this path
for parent in PurePosixPath(o.key[len(prefix) :]).parents:
# if we haven't surfaced their directory already
if parent not in yielded_dirs and str(parent) != ".":
yield (self.CloudPath(f"s3://{cloud_path.bucket}/{prefix}{parent}"), True)
yielded_dirs.add(parent)
paginator = self.client.get_paginator("list_objects_v2")

yield (self.CloudPath(f"s3://{o.bucket_name}/{o.key}"), False)
else:
# non recursive is best done with old client API rather than resource
paginator = self.client.get_paginator("list_objects")

for result in paginator.paginate(
Bucket=cloud_path.bucket, Prefix=prefix, Delimiter="/"
):
# sub directory names
for result_prefix in result.get("CommonPrefixes", []):
for result in paginator.paginate(
Bucket=cloud_path.bucket, Prefix=prefix, Delimiter=("" if recursive else "/")
):
# yield everything in common prefixes as directories
for result_prefix in result.get("CommonPrefixes", []):
canonical = result_prefix.get("Prefix").rstrip("/") # keep a canonical form
if canonical not in yielded_dirs:
yield (
self.CloudPath(f"s3://{cloud_path.bucket}/{canonical}"),
True,
)
yielded_dirs.add(canonical)

# check all the keys
for result_key in result.get("Contents", []):
# yield all the parents of any key that have not been yielded already
o_relative_path = result_key.get("Key")[len(prefix) :]
for parent in PurePosixPath(o_relative_path).parents:
parent_canonical = prefix + str(parent).rstrip("/")
if parent_canonical not in yielded_dirs and str(parent) != ".":
yield (
self.CloudPath(f"s3://{cloud_path.bucket}/{parent_canonical}"),
True,
)
yielded_dirs.add(parent_canonical)

# if we already yielded this dir, go to next item in contents
canonical = result_key.get("Key").rstrip("/")
if canonical in yielded_dirs:
continue

# s3 fake directories have 0 size and end with "/"
if result_key.get("Key").endswith("/") and result_key.get("Size") == 0:
yield (
self.CloudPath(f"s3://{cloud_path.bucket}/{result_prefix.get('Prefix')}"),
self.CloudPath(f"s3://{cloud_path.bucket}/{canonical}"),
True,
)
yielded_dirs.add(canonical)

# files in the directory
for result_key in result.get("Contents", []):
# yield object as file
else:
yield (
self.CloudPath(f"s3://{cloud_path.bucket}/{result_key.get('Key')}"),
False,
Expand Down
5 changes: 5 additions & 0 deletions requirements-dev.txt
Expand Up @@ -5,6 +5,7 @@ flake8
ipytest
ipython
jupyter
loguru
matplotlib
mike
mkdocs>=1.2.2
Expand All @@ -19,8 +20,12 @@ pydantic
pytest
pytest-cases
pytest-cov
pytest-xdist
python-dotenv
pywin32; sys_platform == 'win32'
rich
shortuuid
tabulate
tqdm
typer
wheel
2 changes: 1 addition & 1 deletion setup.cfg
Expand Up @@ -21,7 +21,7 @@ ignore_errors = True

[tool:pytest]
testpaths = tests/
addopts = --cov=cloudpathlib --cov-report=term --cov-report=html --cov-report=xml
addopts = --cov=cloudpathlib --cov-report=term --cov-report=html --cov-report=xml -n=auto

[coverage:report]
include = cloudpathlib/**.py

0 comments on commit 19f314b

Please sign in to comment.