Skip to content

Commit

Permalink
implement env var for local cache dir (#361)
Browse files Browse the repository at this point in the history
  • Loading branch information
pjbull committed Oct 9, 2023
1 parent 7a85710 commit 703607c
Show file tree
Hide file tree
Showing 8 changed files with 43 additions and 9 deletions.
1 change: 1 addition & 0 deletions HISTORY.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
- Add `with_stem` to all path types when python version supports it (>=3.9). ([Issue #287](https://github.com/drivendataorg/cloudpathlib/issues/287), [PR #290](https://github.com/drivendataorg/cloudpathlib/pull/290), thanks to [@Gilthans](https://github.com/Gilthans))
- Add `newline` parameter to the `write_text` method to align to `pathlib` functionality as of Python 3.10. ([PR #362](https://github.com/drivendataorg/cloudpathlib/pull/362), thanks to [@pricemg](https://github.com/pricemg))
- Add support for Python 3.12 ([PR #364](https://github.com/drivendataorg/cloudpathlib/pull/364))
- Add `CLOUDPATHLIB_LOCAL_CACHE_DIR` env var for setting local_cache_dir default for clients ([Issue #352](https://github.com/drivendataorg/cloudpathlib/issues/352), [PR #357](https://github.com/drivendataorg/cloudpathlib/pull/357))

## v0.15.1 (2023-07-12)

Expand Down
3 changes: 2 additions & 1 deletion cloudpathlib/azure/azblobclient.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,8 @@ def __init__(
[the caching docs](https://cloudpathlib.drivendata.org/stable/caching/) for more information
about the options in cloudpathlib.enums.FileCacheMode.
local_cache_dir (Optional[Union[str, os.PathLike]]): Path to directory to use as cache
for downloaded files. If None, will use a temporary directory.
for downloaded files. If None, will use a temporary directory. Default can be set with
the `CLOUDPATHLIB_LOCAL_CACHE_DIR` environment variable.
content_type_method (Optional[Callable]): Function to call to guess media type (mimetype) when
writing a file to the cloud. Defaults to `mimetypes.guess_type`. Must return a tuple (content type, content encoding).
"""
Expand Down
7 changes: 7 additions & 0 deletions cloudpathlib/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,13 @@ def __init__(
if file_cache_mode is None:
file_cache_mode = FileCacheMode.from_environment()

if local_cache_dir is None:
local_cache_dir = os.environ.get("CLOUDPATHLIB_LOCAL_CACHE_DIR", None)

# treat empty string as None to avoid writing cache in cwd; set to "." for cwd
if local_cache_dir == "":
local_cache_dir = None

# explicitly passing a cache dir, so we set to persistent
# unless user explicitly passes a different file cache mode
if local_cache_dir and file_cache_mode is None:
Expand Down
3 changes: 2 additions & 1 deletion cloudpathlib/gs/gsclient.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,8 @@ def __init__(
[the caching docs](https://cloudpathlib.drivendata.org/stable/caching/) for more information
about the options in cloudpathlib.enums.FileCacheMode.
local_cache_dir (Optional[Union[str, os.PathLike]]): Path to directory to use as cache
for downloaded files. If None, will use a temporary directory.
for downloaded files. If None, will use a temporary directory. Default can be set with
the `CLOUDPATHLIB_LOCAL_CACHE_DIR` environment variable.
content_type_method (Optional[Callable]): Function to call to guess media type (mimetype) when
writing a file to the cloud. Defaults to `mimetypes.guess_type`. Must return a tuple (content type, content encoding).
"""
Expand Down
3 changes: 2 additions & 1 deletion cloudpathlib/s3/s3client.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,8 @@ def __init__(
[the caching docs](https://cloudpathlib.drivendata.org/stable/caching/) for more information
about the options in cloudpathlib.enums.FileCacheMode.
local_cache_dir (Optional[Union[str, os.PathLike]]): Path to directory to use as cache
for downloaded files. If None, will use a temporary directory.
for downloaded files. If None, will use a temporary directory. Default can be set with
the `CLOUDPATHLIB_LOCAL_CACHE_DIR` environment variable.
endpoint_url (Optional[str]): S3 server endpoint URL to use for the constructed boto3 S3 resource and client.
Parameterize it to access a customly deployed S3-compatible object store such as MinIO, Ceph or any other.
boto3_transfer_config (Optional[dict]): Instantiated TransferConfig for managing
Expand Down
10 changes: 5 additions & 5 deletions docs/docs/caching.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/docs/script/caching.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@
#
# However, sometimes I don't want to have to re-download files I know won't change. For example, in the LADI dataset, I may want to use the images in a Jupyter notebook and every time I restart the notebook I want to always have the downloaded files. I don't want to ever re-download since I know the LADI images won't be changing on S3. I want these to be there, even if I restart my whole machine.
#
# We can do this just by using a `Client` that does all the downloading/uploading to a specific folder on our local machine.
# We can do this just by using a `Client` that does all the downloading/uploading to a specific folder on our local machine. We set the cache folder by passing `local_cache_dir` to the `Client` when instantiating. You can also set a default for all clients by setting the `CLOUDPATHLIB_LOCAL_CACHE_DIR` environment variable to a path. (This is only recommended (1) with an absolute path, so you know where the cache is no matter where your code is running, and (2) if you only use the default client for one cloud provider and don't instantiate multiple clients; otherwise, the clients will use the same cache dir and could overwrite each other's content. Setting `CLOUDPATHLIB_LOCAL_CACHE_DIR` to an empty string will be treated as it not being set.)

from cloudpathlib import S3Client

Expand Down
23 changes: 23 additions & 0 deletions tests/test_caching.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
from time import sleep
from pathlib import Path

import pytest

Expand Down Expand Up @@ -283,6 +284,28 @@ def test_environment_variable_instantiation(rig: CloudProviderTestRig, tmpdir):
os.environ["CLOUPATHLIB_FILE_CACHE_MODE"] = original_env_setting


def test_environment_variable_local_cache_dir(rig: CloudProviderTestRig, tmpdir):
    """Check that `CLOUDPATHLIB_LOCAL_CACHE_DIR` sets the default cache dir for clients.

    Two cases are exercised:

    1. The variable points at `tmpdir`: a client created without an explicit
       `local_cache_dir` must use that directory, and a downloaded file must
       land inside it.
    2. The variable is an empty string: the client must fall back to a
       temporary cache directory.

    The variable is restored to its exact prior state afterwards, including
    being removed again if it was not set before the test ran.
    """
    env_var = "CLOUDPATHLIB_LOCAL_CACHE_DIR"

    # None (rather than "") records "was not set", so cleanup can tell an
    # unset variable apart from one that was explicitly set to empty.
    original_env_setting = os.environ.get(env_var)

    try:
        os.environ[env_var] = tmpdir.strpath
        client = rig.client_class(**rig.required_client_kwargs)
        assert client._local_cache_dir == Path(tmpdir.strpath)

        cp = rig.create_cloud_path("dir_0/file0_0.txt", client=client)
        cp.fspath  # download from cloud into the cache
        assert (tmpdir / cp._no_prefix).exists()

        # "" treated as None; falls back to temp dir for cache
        os.environ[env_var] = ""
        client = rig.client_class(**rig.required_client_kwargs)
        assert client._cache_tmp_dir is not None

    finally:
        # Do not leave an empty-string value behind for later tests when the
        # variable did not exist before this test.
        if original_env_setting is None:
            os.environ.pop(env_var, None)
        else:
            os.environ[env_var] = original_env_setting


def test_manual_cache_clearing(rig: CloudProviderTestRig):
# use client that we can delete rather than default
client = rig.client_class(**rig.required_client_kwargs)
Expand Down

0 comments on commit 703607c

Please sign in to comment.