Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use list_objects_v2 API for S3 #302

Merged
merged 21 commits into from Dec 18, 2022
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
5 changes: 5 additions & 0 deletions .gitignore
Expand Up @@ -4,6 +4,9 @@ docs/docs/command-reference/*.md
docs/docs/index.md
docs/docs/changelog.md

# perf output
perf-results.csv

## GitHub Python .gitignore ##
# https://github.com/github/gitignore/blob/master/Python.gitignore

Expand Down Expand Up @@ -76,6 +79,7 @@ target/

# Jupyter Notebook
.ipynb_checkpoints
scratchpad.ipynb

# pyenv
.python-version
Expand All @@ -88,6 +92,7 @@ celerybeat-schedule

# dotenv
.env
.gscreds.json

# virtualenv
.venv
Expand Down
12 changes: 9 additions & 3 deletions Makefile
Expand Up @@ -51,7 +51,7 @@ docs: clean-docs docs-setup ## build the static version of the docs
docs-serve: clean-docs docs-setup ## serve documentation to livereload while you work
cd docs && mkdocs serve

format:
format: ## run black to format codebase
pjbull marked this conversation as resolved.
Show resolved Hide resolved
black cloudpathlib tests docs

help:
Expand All @@ -60,7 +60,7 @@ help:
install: clean ## install the package to the active Python's site-packages
python setup.py install

lint: ## check style with flake8
lint: ## check style with black, flake8, and mypy
black --check cloudpathlib tests docs
flake8 cloudpathlib tests docs
mypy cloudpathlib
Expand All @@ -71,11 +71,17 @@ release: dist ## package and upload a release
release-test: dist
twine upload --repository pypitest dist/*

reqs:
reqs: ## install development requirements
pip install -U -r requirements-dev.txt

test: ## run tests with mocked cloud SDKs
python -m pytest -vv

test-debug: ## rerun tests that failed in last run and stop with pdb at failures
python -m pytest -n=0 -vv --lf --pdb

test-live-cloud: ## run tests on live cloud backends
USE_LIVE_CLOUD=1 python -m pytest -vv

perf: ## run performance measurement suite for s3 and save results to perf-results.csv
python tests/performance/cli.py s3 --save-csv=perf-results.csv
4 changes: 4 additions & 0 deletions cloudpathlib/azure/azblobclient.py
Expand Up @@ -142,6 +142,10 @@ def _is_file_or_dir(self, cloud_path: AzureBlobPath) -> Optional[str]:
return None

def _exists(self, cloud_path: AzureBlobPath) -> bool:
    """Return whether ``cloud_path`` exists on the Azure Blob backend.

    A path with a blob component is checked via ``_is_file_or_dir``; a
    container-only path is checked directly against the container client.
    """
    if cloud_path.blob:
        return self._is_file_or_dir(cloud_path) in ["file", "dir"]

    # container-only path: ask the service for the container itself
    container_client = self.service_client.get_container_client(cloud_path.container)
    return container_client.exists()

def _list_dir(
Expand Down
10 changes: 10 additions & 0 deletions cloudpathlib/gs/gsclient.py
Expand Up @@ -4,6 +4,8 @@
from pathlib import Path, PurePosixPath
from typing import Any, Callable, Dict, Iterable, Optional, TYPE_CHECKING, Tuple, Union

from google.api_core.exceptions import NotFound

from ..client import Client, register_client_class
from ..cloudpath import implementation_registry
from .gspath import GSPath
Expand Down Expand Up @@ -135,6 +137,14 @@ def _is_file_or_dir(self, cloud_path: GSPath) -> Optional[str]:
return None

def _exists(self, cloud_path: GSPath) -> bool:
    """Return whether ``cloud_path`` exists on the GCS backend.

    For a bucket-only path (no blob component), existence is probed by
    starting an iteration of the bucket's blobs: a missing bucket raises
    ``NotFound``.
    """
    # short-circuit the root-level bucket
    if not cloud_path.blob:
        try:
            # Pull at most one item purely to trigger NotFound for a missing
            # bucket. The default sentinel prevents StopIteration from
            # escaping when the bucket exists but is empty (the original
            # bare next() would raise in that case instead of returning True).
            next(iter(self.client.bucket(cloud_path.bucket).list_blobs()), None)
            return True
        except NotFound:
            return False

    return self._is_file_or_dir(cloud_path) in ["file", "dir"]

def _list_dir(self, cloud_path: GSPath, recursive=False) -> Iterable[Tuple[GSPath, bool]]:
Expand Down
72 changes: 47 additions & 25 deletions cloudpathlib/s3/s3client.py
Expand Up @@ -143,12 +143,19 @@ def _is_file_or_dir(self, cloud_path: S3Path) -> Optional[str]:
return "dir"

def _exists(self, cloud_path: S3Path) -> bool:
    """Return whether ``cloud_path`` exists on S3.

    A path with a key is resolved through ``_s3_file_query``; a
    bucket-only path is probed with a ``head_bucket`` call, where any
    ``ClientError`` is treated as "does not exist".
    """
    if cloud_path.key:
        return self._s3_file_query(cloud_path) is not None

    # bucket-only path: probe the bucket itself
    try:
        self.client.head_bucket(Bucket=cloud_path.bucket)
    except ClientError:
        return False
    return True

def _s3_file_query(self, cloud_path: S3Path):
"""Boto3 query used for quick checks of existence and if path is file/dir"""
# first check if this is an object that we can access directly

# check if this is an object that we can access directly
try:
obj = self.s3.Object(cloud_path.bucket, cloud_path.key)
obj.load()
Expand All @@ -169,40 +176,55 @@ def _s3_file_query(self, cloud_path: S3Path):
)

def _list_dir(self, cloud_path: S3Path, recursive=False) -> Iterable[Tuple[S3Path, bool]]:
bucket = self.s3.Bucket(cloud_path.bucket)

prefix = cloud_path.key
if prefix and not prefix.endswith("/"):
prefix += "/"

yielded_dirs = set()

if recursive:
for o in bucket.objects.filter(Prefix=prefix):
# get directory from this path
for parent in PurePosixPath(o.key[len(prefix) :]).parents:
# if we haven't surfaced their directory already
if parent not in yielded_dirs and str(parent) != ".":
yield (self.CloudPath(f"s3://{cloud_path.bucket}/{prefix}{parent}"), True)
yielded_dirs.add(parent)
paginator = self.client.get_paginator("list_objects_v2")

yield (self.CloudPath(f"s3://{o.bucket_name}/{o.key}"), False)
else:
# non recursive is best done with old client API rather than resource
paginator = self.client.get_paginator("list_objects")

for result in paginator.paginate(
Bucket=cloud_path.bucket, Prefix=prefix, Delimiter="/"
):
# sub directory names
for result_prefix in result.get("CommonPrefixes", []):
for result in paginator.paginate(
Bucket=cloud_path.bucket, Prefix=prefix, Delimiter=("" if recursive else "/")
):
# yield everything in common prefixes as directories
for result_prefix in result.get("CommonPrefixes", []):
canonical = result_prefix.get("Prefix").rstrip("/") # keep a canonical form
if canonical not in yielded_dirs:
yield (
self.CloudPath(f"s3://{cloud_path.bucket}/{canonical}"),
True,
)
yielded_dirs.add(canonical)

# check all the keys
for result_key in result.get("Contents", []):
# yield all the parents of any key that have not been yielded already
o_relative_path = result_key.get("Key")[len(prefix) :]
for parent in PurePosixPath(o_relative_path).parents:
parent_canonical = prefix + str(parent).rstrip("/")
if parent_canonical not in yielded_dirs and str(parent) != ".":
yield (
self.CloudPath(f"s3://{cloud_path.bucket}/{parent_canonical}"),
True,
)
yielded_dirs.add(parent_canonical)

# if we already yielded this dir, go to next item in contents
canonical = result_key.get("Key").rstrip("/")
if canonical in yielded_dirs:
continue

# s3 fake directories have 0 size and end with "/"
if result_key.get("Key").endswith("/") and result_key.get("Size") == 0:
yield (
self.CloudPath(f"s3://{cloud_path.bucket}/{result_prefix.get('Prefix')}"),
self.CloudPath(f"s3://{cloud_path.bucket}/{canonical}"),
True,
)
yielded_dirs.add(canonical)

# files in the directory
for result_key in result.get("Contents", []):
# yield object as file
else:
yield (
self.CloudPath(f"s3://{cloud_path.bucket}/{result_key.get('Key')}"),
False,
Expand Down
5 changes: 5 additions & 0 deletions requirements-dev.txt
Expand Up @@ -5,6 +5,7 @@ flake8
ipytest
ipython
jupyter
loguru
matplotlib
mike
mkdocs>=1.2.2
Expand All @@ -19,8 +20,12 @@ pydantic
pytest
pytest-cases
pytest-cov
pytest-xdist
python-dotenv
pywin32; sys_platform == 'win32'
rich
shortuuid
tabulate
tqdm
typer
wheel
2 changes: 1 addition & 1 deletion setup.cfg
Expand Up @@ -21,7 +21,7 @@ ignore_errors = True

[tool:pytest]
testpaths = tests/
addopts = --cov=cloudpathlib --cov-report=term --cov-report=html --cov-report=xml
addopts = --cov=cloudpathlib --cov-report=term --cov-report=html --cov-report=xml -n=auto

[coverage:report]
include = cloudpathlib/**.py
11 changes: 9 additions & 2 deletions tests/mock_clients/mock_azureblob.py
Expand Up @@ -33,7 +33,7 @@ def get_blob_client(self, container, blob):
return MockBlobClient(self.tmp_path, blob, service_client=self)

def get_container_client(self, container):
return MockContainerClient(self.tmp_path)
return MockContainerClient(self.tmp_path, container_name=container)

return MockBlobServiceClient

Expand Down Expand Up @@ -106,8 +106,15 @@ def content_as_bytes(self):


class MockContainerClient:
def __init__(self, root):
def __init__(self, root, container_name):
self.root = root
self.container_name = container_name

def exists(self):
    # Only the container named "container" is treated as existing
    # (the name used by passing tests).
    return self.container_name == "container"

def list_blobs(self, name_starts_with=None):
return mock_item_paged(self.root, name_starts_with)
Expand Down
16 changes: 13 additions & 3 deletions tests/mock_clients/mock_gs.py
Expand Up @@ -3,6 +3,8 @@
import shutil
from tempfile import TemporaryDirectory

from google.api_core.exceptions import NotFound

from .utils import delete_empty_parents_up_to_root

TEST_ASSETS = Path(__file__).parent.parent / "assets"
Expand Down Expand Up @@ -30,7 +32,7 @@ def __del__(self):
self.tmp.cleanup()

def bucket(self, bucket):
return MockBucket(self.tmp_path, client=self)
return MockBucket(self.tmp_path, bucket, client=self)

return MockClient

Expand Down Expand Up @@ -87,8 +89,9 @@ def content_type(self):


class MockBucket:
def __init__(self, name, client=None):
def __init__(self, name, bucket_name, client=None):
self.name = name
self.bucket_name = bucket_name
self.client = client

def blob(self, blob):
Expand All @@ -113,7 +116,14 @@ def list_blobs(self, max_results=None, prefix=None):
for f in path.glob("**/*")
if f.is_file() and not f.name.startswith(".")
]
return MockHTTPIterator(items, max_results)

# bucket name for passing tests
if self.bucket_name == "bucket":
return iter(MockHTTPIterator(items, max_results))
else:
raise NotFound(
f"Bucket {self.name} not expected as mock bucket; only 'bucket' exists."
)


class MockHTTPIterator:
Expand Down
36 changes: 34 additions & 2 deletions tests/mock_clients/mock_s3.py
Expand Up @@ -199,6 +199,19 @@ def __init__(self, root, session=None):
def get_paginator(self, api):
return MockBoto3Paginator(self.root, session=self.session)

def head_bucket(self, Bucket):
    """Mock of the S3 ``head_bucket`` API: only the bucket named
    "bucket" (used in passing tests) exists; any other name raises
    ``ClientError`` as the real client would."""
    if Bucket != "bucket":
        raise ClientError(
            {
                "Error": {
                    "Message": f"Bucket {Bucket} not expected as mock bucket; only 'bucket' exists."
                }
            },
            {},
        )
    return {"Bucket": Bucket}

@property
def exceptions(self):
Ex = collections.namedtuple("Ex", "NoSuchKey")
Expand All @@ -214,14 +227,33 @@ def __init__(self, root, per_page=2, session=None):
def paginate(self, Bucket=None, Prefix="", Delimiter=None):
new_dir = self.root / Prefix

items = [f for f in new_dir.iterdir() if not f.name.startswith(".")]
if Delimiter == "/":
items = [f for f in new_dir.iterdir() if not f.name.startswith(".")]
else:
items = [f for f in new_dir.rglob("*") if not f.name.startswith(".")]

for ix in range(0, len(items), self.per_page):
page = items[ix : ix + self.per_page]
dirs = [
{"Prefix": str(_.relative_to(self.root).as_posix())} for _ in page if _.is_dir()
]
files = [
{"Key": str(_.relative_to(self.root).as_posix())} for _ in page if _.is_file()
{
"Key": str(_.relative_to(self.root).as_posix()),
"Size": 123
if not _.relative_to(self.root).exists()
else _.relative_to(self.root).stat().st_size,
}
for _ in page
if _.is_file()
]

# s3 can have "fake" directories where size is 0, but it is listed in "Contents" (see #198)
# add one in here for testing
if dirs:
fake_dir = dirs.pop(-1)
fake_dir["Size"] = 0
fake_dir["Key"] = fake_dir.pop("Prefix") + "/" # fake dirs have '/' appended
files.append(fake_dir)

yield {"CommonPrefixes": dirs, "Contents": files}
Empty file added tests/performance/__init__.py
Empty file.