diff --git a/.travis.yml b/.travis.yml
index 028875447..5f9fced1c 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -55,7 +55,7 @@ jobs:
   # check formatting
   - stage: formatting
     python: "3.7"
-    script: black --check podpac
+    script: black --check --diff podpac
   # deploy docs to `podpac-docs` repository. This script only pushes the docs on pushes to develop and master.
   - stage: docs deploy
     python: "3.7"
diff --git a/CHANGELOG.md b/CHANGELOG.md
index ed5469b59..c8bd8fddf 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,9 @@
 # Changelog
 
+## 2.0.0
+
+### Breaking changes
+* Renamed 'native_coordinates' to 'coordinates'
 
 ## 1.3.0
 
diff --git a/dist/aws/Dockerfile b/dist/aws/Dockerfile
index 462bcd93b..aba75cf9d 100644
--- a/dist/aws/Dockerfile
+++ b/dist/aws/Dockerfile
@@ -1,40 +1,59 @@
 FROM amazonlinux:latest
 
-ARG COMMIT_SHA=""
-ARG TAG=""
-RUN echo $COMMIT_SHA
+ARG REF="master"
 
-RUN yum update -y
-
-# Install apt dependencies
-RUN yum install -y gcc gcc-c++ freetype-devel yum-utils findutils openssl-devel
-
-RUN yum -y groupinstall development
+# development tools
+RUN yum update -y && \
+    yum -y groupinstall development && \
+    yum -y install \
+    gcc \
+    gcc-c++ \
+    git \
+    zip \
+    freetype-devel \
+    yum-utils \
+    findutils \
+    openssl-devel \
+    && yum clean all
 
 # Mock current AWS Lambda docker image
-# Find complete list of package https://gist.github.com/vincentsarago/acb33eb9f0502fcd38e0feadfe098eb7
-RUN yum install -y libjpeg-devel libpng-devel libcurl-devel ImageMagick-devel.x86_64 python3-devel.x86_64 which
+# NOTE: this is still Py3.7, need to be careful about version management
+RUN yum -y install \
+    python3 \
+    python3-pip \
+    python3-devel \
+    && yum clean all
 
-ENV LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
+# clone the podpac repository and checkout the requested tag
+# for developers looking to create a custom deployment package or dependencies,
+# comment this block and un-comment the next block
+RUN git clone https://github.com/creare-com/podpac.git /podpac/ &&\
+    pushd /podpac/ && \
+    git fetch --all && \
+    git checkout $REF && \
+    popd
 
-ADD . /podpac/
+# # uncomment this block to create a custom deployment package or dependencies archive
+# # based on your local copy of the PODPAC repository
+# # this command assumes you are building the Dockerfile using `build_lambda.sh` (which runs from the root of the PODPAC repository )
+# ADD . /podpac/
 
+# Install core, datatype and aws optional dependencies
 RUN mkdir /tmp/vendored/ && \
-    cp /podpac/settings.json /tmp/vendored/settings.json && \
     cd /podpac/ && rm -rf .git/ doc/ .github/ && \
-    pip3 install -r dist/aws/aws_requirements.txt -t /tmp/vendored/ --upgrade
+    pip3 install .
-t /tmp/vendored/ --upgrade && \ + pip3 install .[datatype] -t /tmp/vendored/ --upgrade && \ + pip3 install .[aws] -t /tmp/vendored/ --upgrade +# need to add some __init__ files RUN cd /tmp/vendored/ && touch pydap/__init__.py && \ touch pydap/responses/__init__.py && \ touch pydap/handlers/__init__.py && \ touch pydap/parsers/__init__.py -RUN cp -r /podpac/ /tmp/vendored/ && \ - mv /tmp/vendored/podpac/dist/aws/handler.py /tmp/vendored/handler.py && \ - cp tmp/vendored/podpac/dist/aws/_mk_dist.py /tmp/vendored/_mk_dist.py && \ - rm -rf /tmp/vendored/podpac/dist/ && \ - cp -r /tmp/vendored/podpac/podpac/* /tmp/vendored/podpac/ && \ - rm -rf /tmp/vendored/podpac/podpac/* +# copy handler and _mk_dist: +RUN cp /podpac/dist/aws/handler.py /tmp/vendored/handler.py && \ + cp /podpac/dist/aws/_mk_dist.py /tmp/vendored/_mk_dist.py RUN cd /tmp/vendored && \ find * -maxdepth 0 -type f | grep ".zip" -v | grep -v ".pyc" | xargs zip -9 -rqy podpac_dist.zip diff --git a/dist/aws/README.md b/dist/aws/README.md index d91d6eaf6..5c74d6d42 100644 --- a/dist/aws/README.md +++ b/dist/aws/README.md @@ -17,19 +17,6 @@ The bucket itself is private, but each directory is made public individually. The following process is used to create new PODPAC distribution in the `podpac-dist` bucket when a new version of PODPAC is released. -- Run `build_lambda.sh`. Note this currently requires `settings.json` to copied to the root of the podpac directory. +- Run `build_lambda.sh` - Run `upload_lambda.sh` - Navigate to `podpac-dist` (or input bucket) and make the archives public - -## Handler - -...document handler.py... - -## Using Distribution - -...document podpac build process... - -## Debugging Lambda Function - -Use the script `print_logs.sh` to read cloud watch logs from your built lambda function. -This is currently the only way to debug your lambda function. diff --git a/dist/aws/aws_requirements.txt b/dist/aws/aws_requirements.txt deleted file mode 100644 index 95fd4ca7e..000000000 --- a/dist/aws/aws_requirements.txt +++ /dev/null @@ -1,19 +0,0 @@ -matplotlib>=2.1 -numpy>=1.14 -pint>=0.8 -scipy>=1.0 -traitlets>=4.3 -xarray>=0.10 -requests>=2.18 -beautifulsoup4>=4.6 -h5py>=2.9 -lxml>=4.2 -pydap>=3.2 -rasterio>=0.36 -pyproj>=2.4 -requests>=2.18 -numexpr>=2.6 -lazy-import>=0.2.2 -psutil -zarr>=2.3 -s3fs>=0.2 diff --git a/dist/aws/build_lambda.sh b/dist/aws/build_lambda.sh index a28030125..8fbf5ad8e 100644 --- a/dist/aws/build_lambda.sh +++ b/dist/aws/build_lambda.sh @@ -1,39 +1,35 @@ #!/bin/sh # -# Build podpac lambda distribution and dependencies -# -# Currently, this builds the function using the local -# podpac repository, including any outstanding changes. +# Build podpac lambda distribution and dependencies. +# Change $REF to specify a specific branch, tag, or commit in podpac to build from. 
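+#
+# Example workflow (illustrative): run from the `dist/aws` directory, and edit the
+# REF variable below to select the branch, tag, or commit to build from:
+#
+#   $ bash build_lambda.sh    # produces podpac_dist.zip and podpac_deps.zip in dist/aws
+#   $ bash upload_lambda.sh   # optionally upload both archives to S3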
# # Usage: # -# $ bash build_lambda.sh [s3-bucket] [function-name] +# $ bash build_lambda.sh # # Requires: # - Docker -# - `settings.json` to be copied to the root directory of the podpac repository -# This will not be required in the future -# -# Example usage: -# -# $ bash build_lambda.sh - # variables -COMMIT_SHA="$(git rev-parse HEAD)" -TAG="$(git describe --always)" +REF="master" +# REF="tags/1.1.0" # Change $REF to the branch, tag, or commit in podpac you want to use +# REF="develop" + DOCKER_NAME="podpac" -DOCKER_TAG=$TAG +DOCKER_TAG=$REF -echo "Creating docker image from podpac version ${TAG}" +echo "Creating docker image from podpac version ${REF}" echo "${DOCKER_NAME}:${DOCKER_TAG}" # Navigate to root, build docker, and extract zips pushd ../../ -docker build -f dist/aws/Dockerfile --no-cache --tag $DOCKER_NAME:$DOCKER_TAG --build-arg COMMIT_SHA="${COMMIT_SHA}" --build-arg TAG="${TAG}" . +docker build -f dist/aws/Dockerfile --no-cache --tag $DOCKER_NAME:$DOCKER_TAG --build-arg REF="${REF}" . docker run --name "${DOCKER_NAME}" -itd $DOCKER_NAME:$DOCKER_TAG docker cp "${DOCKER_NAME}":/tmp/vendored/podpac_dist.zip ./dist/aws docker cp "${DOCKER_NAME}":/tmp/vendored/podpac_deps.zip ./dist/aws docker stop "${DOCKER_NAME}" docker rm "${DOCKER_NAME}" popd + +echo "Built podpac deployment package: podpac_dist.zip" +echo "Built podpac dependencies: podpac_deps.zip" diff --git a/dist/aws/configure_lambda.sh b/dist/aws/configure_lambda.sh deleted file mode 100644 index 2b316298f..000000000 --- a/dist/aws/configure_lambda.sh +++ /dev/null @@ -1,78 +0,0 @@ -#!/bin/sh -# -# Configure AWS for podpac lambda function -# -# Usage: -# -# $ bash configure_lambda.sh [s3-bucket] [function-name] -# -# Requires: -# - AWS CLI: https://docs.aws.amazon.com/cli/ -# - AWS credentials must be configured using the `aws` cli. 
-# See https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html#cli-quick-configuration -# - Dist and Dependencies uploaded using `upload_lambda.sh` -# - Function must be created from the AWS Dashboard -# - API Gateway must be created from the AWS Dashboard -# - Note down the `rest-api-id` and `resource-id` in parentheses in the top bar of the API gatway dashboard -# - You must Select the top level resource '/' and select "Create Method" -# Example usage: -# -# $ bash configure_lambda.sh podpac-s3 podpac_lambda h827as06ji 1ya7h6 -# - -# TODO: remove this in the future when function generation/update is automated elsewhere - -BUCKET=$1 -FUNCTION=$2 -API_ID=$3 -API_RESOURCE_ID=$4 -TAG="$(git describe --always)" - -if [ -z "$BUCKET" ] - then - echo "S3 bucket name required as first cli argument" - exit 1 - else - echo "Bucket: ${BUCKET}" -fi - -if [ -z "$FUNCTION" ] - then - echo "Function name required as second cli argument" - exit 1 - else - echo "Function: ${FUNCTION}" -fi - -if [ -z "$API_ID" ] - then - echo "Rest API ID required as third cli argument" - exit 1 - else - echo "REST API ID: ${API_ID}" -fi - -if [ -z "$API_RESOURCE_ID" ] - then - echo "API Resource ID required as fourth cli argument" - exit 1 - else - echo "API Resource ID: ${API_RESOURCE_ID}" -fi - -# Update lambda function to use the zips from S3 (uploaded above) -# aws lambda update-function-code --function-name $FUNCTION --s3-bucket $BUCKET --s3-key podpac/podpac_dist_$TAG.zip -# aws lambda update-function-configuration --function-name $FUNCTION --handler handler.handler --timeout 300 --memory-size 2048 -# aws apigateway update-rest-api --rest-api-id $API_ID --patch-operations "op=replace,path=/binaryMediaTypes/*~1*,value='*/*'" -RESOURCE=$(aws apigateway create-resource --rest-api-id $API_ID --parent-id $API_RESOURCE_ID --path-part 'lambda' --output text) -RESOURCE_ID=$(echo "$(echo $RESOURCE | cut -d " " -f1)") -aws apigateway put-method --rest-api-id $API_ID --resource-id $RESOURCE_ID --http-method ANY --authorization-type NONE - -echo "Log in to AWS and perform the following steps:" -echo "1. Navigate to your API in the API Gateway and select the resource /lambda HTTP Method (ANY)." -echo "2. Select Integration Request -> Lambda Function, Check Use Lambda Proxy Integration, Select your lambda function region and function name." -echo "3. Press the Actions dropdown and select Deploy API. Select [New Stage] and create a stage name (doesn't matter exact name)" -echo "4. Navigate to your lambda function console and confirm you see API Gateway as a trigger." 
- -# LAMBDA_URI=`$(aws lambda) -# aws apigateway put-integration --rest-api-id $API_ID --resource-id $API_RESOURCE_ID --http-method ANY --type AWS --integration-http-method POST --uri diff --git a/dist/aws/example.json b/dist/aws/example.json deleted file mode 100644 index 90d6d269c..000000000 --- a/dist/aws/example.json +++ /dev/null @@ -1,18 +0,0 @@ -{ - "pipeline": { - "SinCoords": { - "node": "core.algorithm.algorithm.SinCoords" - } - }, - "output": { - "format": "png", - "vmin": -1.2, - "vmax": 1.2, - }, - "coordinates": { - "TIME": "2018-03-03", - "BBOX": "-180.0,-90.0,180.0,90.0", - "WIDTH": 256, - "HEIGHT": 256 - } -} diff --git a/dist/aws/handler.py b/dist/aws/handler.py index adaae46d0..5e5055ffe 100644 --- a/dist/aws/handler.py +++ b/dist/aws/handler.py @@ -44,6 +44,10 @@ def default_pipeline(pipeline=None): else: pipeline = defaults + # overwrite certain settings so that the function doesn't fail + pipeline["settings"]["ROOT_PATH"] = "/tmp" + pipeline["settings"]["LOG_FILE_PATH"] = "/tmp" + return pipeline @@ -82,7 +86,7 @@ def parse_event(trigger, event): """ if trigger == "eval": - print("Triggered by Invoke") + print ("Triggered by Invoke") # event is the pipeline, provide consistent pipeline defaults pipeline = default_pipeline(event) @@ -90,7 +94,7 @@ def parse_event(trigger, event): return pipeline elif trigger == "S3": - print("Triggered from S3") + print ("Triggered from S3") # get boto s3 client s3 = boto3.client("s3") @@ -129,7 +133,7 @@ def parse_event(trigger, event): return pipeline elif trigger == "APIGateway": - print("Triggered from API Gateway") + print ("Triggered from API Gateway") pipeline = default_pipeline() pipeline["url"] = event["queryStringParameters"] @@ -154,8 +158,8 @@ def parse_event(trigger, event): # If we get here, the api settings were loaded pipeline["settings"] = {**pipeline["settings"], **api_settings} except Exception as e: - print("Got an exception when attempting to load api settings: ", e) - print(pipeline) + print ("Got an exception when attempting to load api settings: ", e) + print (pipeline) # handle OUTPUT in query parameters elif param == "output": @@ -195,7 +199,7 @@ def handler(event, context): ret_pipeline : bool, optional Description """ - print(event) + print (event) # Add /tmp/ path to handle python path for dependencies sys.path.append("/tmp/") @@ -227,13 +231,22 @@ def handler(event, context): os.environ.get("PODPAC_VERSION", pipeline["settings"].get("PODPAC_VERSION")) ) # this should be equivalent to version.semver() - # Download dependencies from specific bucket/object - s3 = boto3.client("s3") - s3.download_file(bucket, dependencies, "/tmp/" + dependencies) - subprocess.call(["unzip", "/tmp/" + dependencies, "-d", "/tmp"]) - sys.path.append("/tmp/") - subprocess.call(["rm", "/tmp/" + dependencies]) - # ----- + # Check to see if this function is "hot", in which case the dependencies have already been downloaded and are + # available for use right away. + if os.path.exists("/tmp/scipy"): + print ( + "Scipy has been detected in the /tmp/ directory. Assuming this function is hot, dependencies will" + " not be downloaded." 
+ ) + else: + # Download dependencies from specific bucket/object + print ("Downloading and extracting dependencies") + s3 = boto3.client("s3") + s3.download_file(bucket, dependencies, "/tmp/" + dependencies) + subprocess.call(["unzip", "/tmp/" + dependencies, "-d", "/tmp"]) + sys.path.append("/tmp/") + subprocess.call(["rm", "/tmp/" + dependencies]) + # ----- # Load PODPAC @@ -285,7 +298,7 @@ def handler(event, context): try: json.dumps(body) except Exception as e: - print("Output body is not serializable, attempting to decode.") + print ("Output body is not serializable, attempting to decode.") body = body.decode() return { diff --git a/dist/aws/upload_lambda.sh b/dist/aws/upload_lambda.sh index 471a637af..5a399ec12 100644 --- a/dist/aws/upload_lambda.sh +++ b/dist/aws/upload_lambda.sh @@ -1,36 +1,28 @@ #!/bin/sh # # Upload podpac lambda distribution and dependencies -# -# Currently, this uploads the zip archives and updates -# the specific lambda function +# Change $BUCKET or $DIR to control the S3 Bucket and Bucket path +# where zip archives are uploaded. # # Usage: # -# $ bash upload_lambda.sh [s3-bucket] +# $ bash upload_lambda.sh # # Requires: # - AWS CLI: https://docs.aws.amazon.com/cli/ # - AWS credentials must be configured using the `aws` cli. # See https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html#cli-quick-configuration -# -# Example usage: -# -# $ bash upload_lambda.sh -BUCKET=podpac-dist -TAG="$(git describe --always)" -if [ ! -z "$1" ] - then - BUCKET=$1 -fi +BUCKET="podpac-dist" +DIR="dev" +# DIR="1.3.0" # for releases, upload to release path by semantic version -echo "Uploading podpac distribution to bucket: ${BUCKET}" +AWSPATH="s3://$BUCKET/$DIR" +echo "Uploading podpac distribution to S3 path: ${AWSPATH}" # Upload zips to S3 -aws s3 cp podpac_deps.zip s3://$BUCKET/$TAG/podpac_deps.zip -aws s3 cp podpac_dist.zip s3://$BUCKET/$TAG/podpac_dist.zip -# rm podpac_deps.zip podpac_dist.zip +aws s3 cp podpac_deps.zip $AWSPATH/podpac_deps.zip +aws s3 cp podpac_dist.zip $AWSPATH/podpac_dist.zip echo "Navigate to your bucket $BUCKET, select the zip archives you just uploaded and make them public" diff --git a/doc/source/api.rst b/doc/source/api.rst index 34e522af3..0373da7f0 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -108,29 +108,6 @@ Classes to manage interpolation podpac.interpolators.ScipyPoint -Pipelines ---------- - -.. autosummary:: - :toctree: api/ - :template: class.rst - - podpac.pipeline.Pipeline - podpac.pipeline.PipelineError - -.. rubric:: Pipeline Outputs - -.. autosummary:: - :toctree: api/ - :template: class.rst - - podpac.pipeline.Output - podpac.pipeline.NoOutput - podpac.pipeline.FileOutput - podpac.pipeline.FTPOutput - podpac.pipeline.S3Output - podpac.pipeline.ImageOutput - Algorithm Nodes --------------- @@ -200,8 +177,9 @@ Stitch multiple data sources together :toctree: api/ :template: class.rst - podpac.compositor.Compositor podpac.compositor.OrderedCompositor + podpac.compositor.UniformTileCompositor + podpac.compositor.UniformTileMixin Datalib @@ -243,8 +221,8 @@ Utilities :toctree: api/ :template: class.rst - podpac.authentication.SessionWithHeaderRedirection - podpac.authentication.EarthDataSession + podpac.authentication.RequestsSessionMixin + podpac.authentication.S3Mixin .. rubric:: Settings @@ -264,6 +242,19 @@ Utilities podpac.utils.create_logfile podpac.utils.clear_cache + podpac.utils.cached_property + podpac.utils.NoCacheMixin + podpac.utils.DiskCacheMixin + podpac.utils.NodeTrait + + +.. 
rubric:: Style
+
+.. autosummary::
+    :toctree: api/
+    :template: module.rst
+
+    podpac.style.Style
 
 .. rubric:: Version
 
diff --git a/doc/source/aws-development.md b/doc/source/aws-development.md
index 034e7602b..4db7e16f7 100644
--- a/doc/source/aws-development.md
+++ b/doc/source/aws-development.md
@@ -1,27 +1,80 @@
 # AWS Development
 
-This document describes more details on using the AWS functionality of PODPAC.
+This document provides details on how PODPAC integrates with AWS services for serverless cloud processing.
+See the [AWS Quick Start Guide](aws.html) for a quick guide to building and using AWS services with PODPAC.
 
 ## AWS Architecture
 
-All files related to creating a Lambda function are in `dist/aws`. The `DockerFile` is based on Amazon's EC2 DockerHub distribution, and creates a Podpac-friendly python 3.6 environment. A `.zip` file is extracted from this environment, which can be used to create a Lambda function. Conveniently, developers can also use this to create an EC2 instance, or work directly in the Docker container.
+## Creating PODPAC resources for AWS
 
-Our `handler.py` expects the Lambda event to include a pipeline definition in the form of (URI encoded) JSON. The handler then executes that pipeline accordingly. However, developers are encouraged to write their own handlers as needed.
+> [Docker](https://www.docker.com/) is required for creating PODPAC resources for AWS services.
 
-## Creating Your Own Podpac Lambda Function
+All files related to creating PODPAC resources for AWS live in [`dist/aws`](https://github.com/creare-com/podpac/tree/master/dist/aws).
 
-We're now set up to create an AWS Lambda function "out of the box". Assuming you've installed Docker, here are the steps to create a Lambda function:
+- `handler.py`: [AWS Lambda function handler](https://docs.aws.amazon.com/lambda/latest/dg/python-programming-model-handler-types.html). Handles the PODPAC Lambda trigger event, executes the pipeline, and returns the result back to the source of the trigger. Developers can override the default function handler for a [`Lambda`](api/podpac.managers.Lambda.html) Node using the [`function_handler`](api/podpac.managers.Lambda.html#podpac.managers.Lambda.function_handler) attribute.
+- `Dockerfile`: Docker instructions for creating the PODPAC deployment package and dependencies using [Amazon's EC2 DockerHub](https://hub.docker.com/_/amazonlinux/) distribution.
+- `build_lambda.sh`: Bash script to build the PODPAC deployment package and dependencies using [Docker](https://www.docker.com/). Outputs `podpac_dist.zip` and `podpac_deps.zip` in the `dist/aws` directory.
+  - `podpac_dist.zip`: PODPAC [deployment package](https://docs.aws.amazon.com/lambda/latest/dg/gettingstarted-features.html#gettingstarted-features-package) ready to create a Lambda function.
+  - `podpac_deps.zip`: PODPAC dependencies that are hosted on S3 and dynamically extracted during Lambda function execution. These files are separate from `podpac_dist.zip` to circumvent the space limitations of AWS Lambda functions.
+- `upload_lambda.sh`: Convenience script to upload the deployment package and dependencies to an S3 bucket.
 
-- Run `docker build -f DockerFile --tag $NAME:$TAG .` from the `dist/aws` directory
-- Create a Lambda using the resulting `podpac:latest/tmp/package.zip`
-  - For example, we've chosen to do this as follows:
-
-    ```bash
-    docker run --name lambda -itd $NAME:$TAG
-    docker cp lambda:/tmp/package.zip package.zip
-    docker stop lambda
-    docker rm lambda
-    ```
-  - Upload package.zip (~67 MB) to S3.
- - Create a Lambda function from the AWS developer console - - Copy the link address of package.zip from its S3 bucket, paste into the Lambda's "Function code" field - - Set up any other Lambda properties you'd like. We use S3 triggers - the handler is triggered when pipeline JSON is uploaded to our S3 bucket +To create a custom deployment package or dependencies package: + +- Edit the `Dockerfile` or `handler.py` with desired changes + - To build using the local copy of the PODPAC repository,see comment in `Dockerfile` at ~L36. +- Build the deployment package and dependencies + +```bash +$ bash build_lambda.sh +Creating docker image from podpac version master +podpac:master +... +Built podpac deployment package: podpac_dist.zip +Built podpac dependencies: podpac_deps.zip +``` + +- You can now use PODPAC to create a function from these local resources: + +```python +import podpac + +# configure settings +settings["AWS_ACCESS_KEY_ID"] = "access key id" +settings["AWS_SECRET_ACCESS_KEY"] = "secrect access key" +settings["AWS_REGION_NAME"] = "region name" +settings["S3_BUCKET_NAME"] = "bucket name" +settings["FUNCTION_NAME"] = "function name" +settings["FUNCTION_ROLE_NAME"] = "role name" + +# define node +node = podpac.managers.aws.Lambda(function_source_dist_zip="dist/aws/podpac_dist.zip", + function_source_dependencies_zip="dist/aws/podpac_deps.zip" + ) + +# build AWS resources +node.build() +``` + +- You can also upload `podpac_dist.zip` and `podpac_deps.zip` to a public or user-accessible S3 bucket and build PODPAC functions from the remote bucket. The bash script `upload_lambda.sh` can do this for you if the `BUCKET` variable is customized. +We'll assume you copy the files to `s3://my-bucket/directory/podpac_dist.zip` and `s3://my-bucket/directory/podpac_deps.zip`: + +```python +import podpac + +# configure settings +settings["AWS_ACCESS_KEY_ID"] = "access key id" +settings["AWS_SECRET_ACCESS_KEY"] = "secrect access key" +settings["AWS_REGION_NAME"] = "region name" +settings["S3_BUCKET_NAME"] = "bucket name" +settings["FUNCTION_NAME"] = "function name" +settings["FUNCTION_ROLE_NAME"] = "role name" + +# define node +node = podpac.managers.aws.Lambda(function_source_bucket="my-bucket", + function_source_dist_key="directory/podpac_dist.zip", + function_source_dependencies_key="directory/podpac_deps.zip" + ) + +# build AWS resources +node.build() +``` diff --git a/doc/source/cache.md b/doc/source/cache.md index b9dcf2fd3..a9977ef70 100644 --- a/doc/source/cache.md +++ b/doc/source/cache.md @@ -55,16 +55,18 @@ smap = podpac.datalib.smap.SMAP(cache_output=True) Different instances of the same node share a cache. For example: ```python ->>> coords = podpac.Coordinates([podpac.clinspace(40, 39, 16), +[.] import podpac +[.] import podpac.datalib +[.] coords = podpac.Coordinates([podpac.clinspace(40, 39, 16), podpac.clinspace(-100, -90, 16), '2015-01-01T00', ['lat', 'lon', 'time']]) ->>> smap1 = podpac.datalib.smap.SMAP() ->>> o = smap1.eval(coords) ->>> smap1._from_cache +[.] smap1 = podpac.datalib.smap.SMAP() +[.] o = smap1.eval(coords) +[.] smap1._from_cache False ->>> del smap1 ->>> smap2 = podpac.datalib.smap.SMAP() ->>> o = smap2.eval(coords) ->>> smap2._from_cache +[.] del smap1 +[.] smap2 = podpac.datalib.smap.SMAP() +[.] o = smap2.eval(coords) +[.] 
smap2._from_cache True ``` diff --git a/doc/source/conf.py b/doc/source/conf.py index 081d05495..d5d479b72 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -227,7 +227,7 @@ def generate_example_links(): prestring = "- " string = "\n".join( [ - prestring + " `{} <{}>`__".format(f, base_link + "/" + f) + prestring + " `{} <{}>`_".format(f.split('.ipynb')[0].replace('-', ' ').capitalize(), base_link + "/" + f) for f in files if f.endswith("ipynb") ] @@ -237,7 +237,7 @@ def generate_example_links(): f for f in files if os.path.isdir(os.path.join(nbpath, f)) - and f not in [".ipynb_checkpoints", "Images", "old_examples", "presentations"] + and f not in [".ipynb_checkpoints", "developer", "__pycache__"] ] subdirs.sort() for sd in subdirs: @@ -245,11 +245,11 @@ def generate_example_links(): link = base_link + "/" + sd fs = os.listdir(path) fs.sort() - string += "\n- {}\n".format(sd) + string += "\n- {}\n".format(sd.replace('-', ' ').title()) prestring = " -" string += "\n".join( [ - prestring + " `{} <{}>`__".format(f, link + "/" + f) + prestring + " `{} <{}>`_".format(f.split('.ipynb')[0].replace('-', ' ').capitalize(), link + "/" + f) for f in fs if f.endswith("ipynb") ] diff --git a/doc/source/coordinates.md b/doc/source/coordinates.md index 92748bbff..d4c9ee262 100644 --- a/doc/source/coordinates.md +++ b/doc/source/coordinates.md @@ -26,14 +26,15 @@ are `'lat'`, `'lon'`, `'time'`, and `'alt'`. Unstacked multidimensional coordinates form a grid of points. For example, the following Coordinates contain three dimensions and a total of 24 points. ``` ->>> lat = [0, 1, 2] ->>> lon = [10, 20, 30, 40] ->>> time = ['2018-01-01', '2018-01-02'] ->>> Coordinates([lat, lon], dims=['lat', 'lon']) +[.] from podpac import Coordinates +[.] lat = [0, 1, 2] +[.] lon = [10, 20, 30, 40] +[.] time = ['2018-01-01', '2018-01-02'] +[.] Coordinates([lat, lon], dims=['lat', 'lon']) Coordinates lat: ArrayCoordinates1d(lat): Bounds[0.0, 2.0], N[3], ctype['midpoint'] lon: ArrayCoordinates1d(lon): Bounds[10.0, 40.0], N[4], ctype['midpoint'] ->>> Coordinates([lat, lon, time], dims=['lat', 'lon', 'time']) +[.] Coordinates([lat, lon, time], dims=['lat', 'lon', 'time']) Coordinates lat: ArrayCoordinates1d(lat): Bounds[0.0, 2.0], N[3], ctype['midpoint'] lon: ArrayCoordinates1d(lon): Bounds[10.0, 40.0], N[4], ctype['midpoint'] @@ -43,6 +44,7 @@ Coordinates You can also create coordinates with just one dimension the same way: ``` +>>> from podpac import Coordinates >>> Coordinates([time], dims=['time']) Coordinates time: ArrayCoordinates1d(time): Bounds[2018-01-01, 2018-01-02], N[2], ctype['midpoint'] @@ -57,31 +59,33 @@ that the name for this stacked dimension is 'lat_lon', using an underscore to co The following example has a single stacked dimension and a total of 3 points. ``` ->>> lat = [0, 1, 2] ->>> lon = [10, 20, 30] ->>> c = Coordinates([[lat, lon]], dims=['lat_lon']) ->>> c +[.] from podpac import Coordinates +[.] lat = [0, 1, 2] +[.] lon = [10, 20, 30] +[.] c = Coordinates([[lat, lon]], dims=['lat_lon']) +[.] c Coordinates lat_lon[lat]: ArrayCoordinates1d(lat): Bounds[0.0, 2.0], N[3], ctype['midpoint'] lat_lon[lon]: ArrayCoordinates1d(lon): Bounds[10.0, 30.0], N[3], ctype['midpoint'] ->>> c['lat_lon'].coordinates[0] +[.] c['lat_lon'].coordinates[0] (0.0, 10.0) ``` Coordinates can combine stacked dimensions and unstacked dimensions. For example, in the following Coordinates the `(lat, lon)` values and the `time` values form a grid of 6 total points. 
``` ->>> lat = [0, 1, 2] ->>> lon = [10, 20, 30] ->>> time = ['2018-01-01', '2018-01-02'] ->>> c = Coordinates([[lat, lon], time], dims=['lat_lon', 'time']) +[.] from podpac import Coordinates +[.] lat = [0, 1, 2] +[.] lon = [10, 20, 30] +[.] time = ['2018-01-01', '2018-01-02'] +[.] c = Coordinates([[lat, lon], time], dims=['lat_lon', 'time']) Coordinates lat_lon[lat]: ArrayCoordinates1d(lat): Bounds[0.0, 2.0], N[3], ctype['midpoint'] lat_lon[lon]: ArrayCoordinates1d(lon): Bounds[10.0, 30.0], N[3], ctype['midpoint'] time: ArrayCoordinates1d(time): Bounds[2018-01-01, 2018-01-02], N[2], ctype['midpoint'] ->>> c['lat_lon'].coordinates[0] +[.] c['lat_lon'].coordinates[0] (0.0, 10.0) ->>> c['time'].coordinates[0] +[.] c['time'].coordinates[0] numpy.datetime64('2018-01-01') ``` @@ -98,18 +102,13 @@ Unlike `np.arange`: * the stop value will be included in the coordinates if it falls an exact number of steps from the start ``` +>>> import podpac >>> c = podpac.crange(0, 7, 2) >>> c.coordinates array([0., 2., 4., 6.]) -``` - -``` >>> c = podpac.crange(0, 8, 2) >>> c.coordinates array([0., 2., 4., 6., 8.]) -``` - -``` >>> c = podpac.crange('2018-01-01', '2018-03-01', '1,M') >>> c.coordinates array(['2018-01-01', '2018-02-01', '2018-03-01'], dtype='datetime64[D]') @@ -124,17 +123,13 @@ Unlike `np.linspace`: * tuple inputs are supported for stacked coordinates ``` +>>> import podpac >>> c = podpac.clinspace(0, 8, 5) >>> c.coordinates array([0., 2., 4., 6., 8.]) -``` - -```>>> c = podpac.clinspace('2018-01-01', '2018-03-01', 3) +>>> c = podpac.clinspace('2018-01-01', '2018-03-01', 3) >>> c.coordinates array(['2018-01-01', '2018-01-30', '2018-02-28'], dtype='datetime64[D]') -``` - -``` >>> c = podpac.clinspace((0, 10), (1, 20), 3) >>> c.coordinates MultiIndex(levels=[[0.0, 0.5, 1.0], [10.0, 15.0, 20.0]], @@ -157,6 +152,7 @@ TODO ctype, etc Unstacked coordinates can also be created using the `Coordinates.grid` alternate constructor: ``` +>>> from podpac import Coordinates >>> Coordinates.grid(lat=[0, 1, 2], lon=[10, 20, 30, 40]) Coordinates lat: ArrayCoordinates1d(lat): Bounds[0.0, 2.0], N[3], ctype['midpoint'] @@ -166,6 +162,7 @@ Coordinates Stacked coordinates can be created using the `Coordinates.points` alternate constructor: ``` +>>> from podpac import Coordinates >>> Coordinates.points(lat=[0, 1, 2], lon=[10, 20, 30]) Coordinates lat_lon[lat]: ArrayCoordinates1d(lat): Bounds[0.0, 2.0], N[3], ctype['midpoint'] @@ -194,6 +191,7 @@ Coordinates.points(lat=[0, 1, 2], lon=[10, 20, 30], order=['lat', 'lon']) TODO ``` +from podpac.coordinates import UniformCoordinates1d, ArrayCoordinates1d, Coordinates, StackedCoordinates >>> lat = UniformCoordinates1d(0, 1, size=100, name='lat') >>> lon = UniformCoordinates1d(10, 20, size=100, name='lon') >>> time = ArrayCoordinates1d(['2018-01-01', '2018-02-03'], name='time') @@ -213,6 +211,7 @@ TODO Coordinates contain some useful properties relating to its dimensions and underlying coordinate values. ``` +>>> from podpac import Coordinates >>> c = Coordinates([lat, lon, time], dims=['lat', 'lon', 'time']) >>> c.ndims >>> c.dims diff --git a/doc/source/dependencies.md b/doc/source/dependencies.md index ac299748a..800f22b4b 100644 --- a/doc/source/dependencies.md +++ b/doc/source/dependencies.md @@ -4,8 +4,7 @@ This document provides an overview of the dependencies leveraged by PODPAC. 
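As a quick reference, the core package and the optional dependency groups described below can be installed with `pip` (a sketch; the extras names are assumed to match the optional groups listed under Optional Dependencies and defined in `setup.py`):

```bash
# core package only
pip install podpac

# with optional dependency groups (extras names assumed from setup.py)
pip install podpac[datatype]   # file format and data-access handlers
pip install podpac[aws]        # AWS integration
pip install podpac[notebook]   # Jupyter notebook tools
```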
## Requirements -- Python (3.6 or later) - - We suggest you use the the [Anaconda Python Distribution](https://www.anaconda.com/) +- [Python](https://www.python.org/) (3.6 or later) — [Anaconda Python Distribution](https://www.anaconda.com/distribution/#download-section) recommended ## OS Specific Requirements @@ -33,31 +32,44 @@ $ sudo apt-get install build-essential python-dev ## Core Dependencies -> See [requirements.txt](https://github.com/creare-com/podpac/blob/develop/requirements.txt) and [setup.py](https://github.com/creare-com/podpac/blob/develop/setup.py) for the latest dependencies listing. +> See [setup.py](https://github.com/creare-com/podpac/blob/master/setup.py) for the latest dependencies listing. + + "matplotlib>=2.1", + "numpy>=1.14", + "pint>=0.8", + "scipy>=1.0", + "traitlets>=4.3", + "xarray>=0.10", + "requests>=2.18", + "pyproj>=2.4", + "lazy-import>=0.2.2", + "psutil", -- Python 2.7\[[1](#f1)\], 3.5, 3.6, or 3.7 - - We suggest you use the the [Anaconda Python Distribution](https://www.anaconda.com/) - [numpy](http://www.numpy.org/), [scipy](https://www.scipy.org/), [xarray](http://xarray.pydata.org/en/stable/): array handling - [traitlets](https://github.com/ipython/traitlets): input and type handling - [pint](https://pint.readthedocs.io/en/latest/): unit handling - [requests](http://docs.python-requests.org/en/master/): HTTP requests +- [matplotlib](https://matplotlib.org/): plotting +- [pyproj](http://pyproj4.github.io/pyproj/stable/): coordinate reference system handling +- [psutil](https://psutil.readthedocs.io/en/latest/): cache management + ## Optional Dependencies -> Optional dependencies can be [installed using `pip`](insatllation.html#installing-via-pip) +> Optional dependencies can be [installed using `pip`](install.html#install-with-pip) -- Data Handling +- `datatype`: Data Handling - [h5py](https://www.h5py.org/): interface to the HDF5 data format - [pydap](http://www.pydap.org/en/latest/): python support for Data Access Protocol (OPeNDAP) - [rasterio](https://github.com/mapbox/rasterio): read GeoTiff and other raster datasets - [lxml](https://github.com/lxml/lxml): read xml and html files - [beautifulsoup4](https://www.crummy.com/software/BeautifulSoup/): text parser and screen scraper -- AWS +- `aws`: AWS integration - [awscli](https://github.com/aws/aws-cli): unified command line interface to Amazon Web Services - [boto3](https://boto3.amazonaws.com/v1/documentation/api/latest/index.html): Amazon Web Services (AWS) SDK for Python -- Algorithms +- `algorithm`: Algorithm development - [numexpr](https://github.com/pydata/numexpr): fast numerical expression evaluator for NumPy -- Notebook +- `notebook`: Jupyter Notebooks - [jupyterlab](https://github.com/jupyterlab/jupyterlab): extensible environment for interactive and reproducible computing - [ipyleaflet](https://github.com/jupyter-widgets/ipyleaflet): [LeafletJS](https://leafletjs.com/) interface for jupyter notebooks - [ipywidgets](https://ipywidgets.readthedocs.io/en/stable/): interactive widgets for Jupyter notebooks diff --git a/doc/source/deploy-notes.md b/doc/source/deploy-notes.md index 79caf955a..9a316574b 100644 --- a/doc/source/deploy-notes.md +++ b/doc/source/deploy-notes.md @@ -58,12 +58,12 @@ $ bin\activate_podpac_conda_env.bat # Install core dependencies $ conda install matplotlib>=2.1 numpy>=1.14 scipy>=1.0 traitlets>=4.3 xarray>=0.10 ipython psutil requests>=2.18 -$ conda install pyproj>=2.2 rasterio>=1.0 +$ conda install pyproj>=2.2 rasterio>=1.0 -c conda-forge $ pip install pint>=0.8 
lazy-import>=0.2.2 # Install dependencies for handling various file datatype $ # conda install rasterio>=1.0 # Installed above alongside pyproj -$ conda install beautifulsoup4>=4.6 h5py>=2.9 lxml>=4.2 zarr>=2.3 +$ conda install beautifulsoup4>=4.6 h5py>=2.9 lxml>=4.2 zarr>=2.3 intake>=0.5 $ pip install pydap>=3.2 # Install dependencies for AWS diff --git a/doc/source/docs.md b/doc/source/docs.md index 38e3a7b5c..50db1f2de 100644 --- a/doc/source/docs.md +++ b/doc/source/docs.md @@ -7,8 +7,8 @@ The following sections outlines how to develop and build the `podpac` documentat - Install **Sphinx** and the **Read the Docs** theme ```bash -$ conda install sphinx # or `pip install sphinx` -$ conda install sphinx_rtd_theme # or `pip install sphinx-rtd-theme` +$ pip install sphinx +$ pip install sphinx_rtd_theme ``` - Install `recommonmark` to support markdown input files @@ -21,9 +21,11 @@ $ pip install recommonmark To run the doctests in the documentation, run from the `/doc` directory: -``` -$ ./test-docs.sh # convience script -or +```bash +$ ./test-docs.sh # convienence script + +# or + $ sphinx-build -b doctest source build # run tests manually ``` @@ -34,8 +36,10 @@ $ sphinx-build -b doctest source build # run tests manually To build the documentation into a website in the `doc/build` directory, run from the `/doc` directory: ```bash -$ make html -or +$ ./release-docs.sh # convienence script + +# or + $ sphinx-build source build # run manually $ sphinx-build -aE source build # rebuild all files (no cache) ``` @@ -48,14 +52,16 @@ To build a pdf from the documentation, you need to install a latex distribution ```bash $ make latex # build the docs into a tex format in a latex directory -or -$ sphinx-build -b latex source latex # run sphinx manually to build lated b + +# or + +$ sphinx-build -b latex source latex # run sphinx manually to build latex ``` Enter the build directory and convert tex file to pdf: ```bash -$ cd build # go into the latex directory +$ cd build # go into the latex directory $ pdflatex podpac.tex # build the pdf from podpac.tex entry file ``` @@ -63,15 +69,17 @@ $ pdflatex podpac.tex # build the pdf from podpac.tex entry file To live-serve the documentation as a website during development, you will need to add one more python package [`sphinx-autobuild`](https://github.com/GaretJax/sphinx-autobuild): -``` +```bash $ pip install sphinx-autobuild ``` Then run from the `doc` directory: -``` -$ ./serve-docs.sh # convience script -or +```bash +$ ./serve-docs.sh # convienence script + +# or + $ sphinx-autobuild source build # run manually $ sphinx-autobuild -aE source build # rebuild all files (no cache) ``` @@ -86,10 +94,8 @@ To stop the server simply press `^C`. + `/source/_templates` - templates to use for styling pages + `/source/_static` - static files that need to be copied over to distributed documentation (i.e. images, source code, etc) + `/source/conf.py` - sphinx configuration file - + `/source/index.rst` - root documentation file. Includes TOC - + `/source/user/api/` - auto generated API documentation using `sphinx-autogen` - + `/source/user/api-min/` - auto generated minimal API documentation using `sphinx-autogen` - + ... add others as generated ... 
+ + `/source/index.rst` - root documentation file + + `/source/api/` - auto generated API documentation using `sphinx-autogen` - `/build` - generated documentation files ## References diff --git a/doc/source/earthdata.md b/doc/source/earthdata.md index 197095d71..e338864a5 100644 --- a/doc/source/earthdata.md +++ b/doc/source/earthdata.md @@ -3,6 +3,7 @@ This document describes using an Earth Data Account with PODPAC. ## Motivation + * An Earth Data Login account is needed to access the wealth of data provided by NASA. * PODPAC automatically retrieves this data using OpenDAP. @@ -11,10 +12,11 @@ NASA. * save their credentials as part of a PODPAC settings files * provide their username and password to the child class of a PyDAP node at runtime -## Creating an EarthData Login Account -* Go to the [EarthData Registration Page](https://urs.earthdata.nasa.gov/users/new) +## Creating an Earthdata Login Account + +* Go to the [Earthdata Registration Page](https://urs.earthdata.nasa.gov/users/new) page and follow the instructions to register an account -* Go to the [EarthData Login Page](https://urs.earthdata.nasa.gov/) to log into +* Go to the [Earthdata Login Page](https://urs.earthdata.nasa.gov/) to log into your account * To enable OpenDAP access: * Go to your [Profile](https://urs.earthdata.nasa.gov/profile) once logged in. @@ -23,42 +25,37 @@ NASA. * Find the `NASA GESDICS DATA ARCHIVE`, `NSIDC V0 OPeNDAP`, and `NSIDC_DATAPOOL_OPS` applications * Additional applications may be required to access datasets of interest * For each, click the `APPROVE` button -* At this stage, your EarthData account should be set up and ready to use for +* At this stage, your Earthdata account should be set up and ready to use for accessing data through OpenDAP -## Saving EarthData Credentials in PODPAC Settings -For convenience, PODPAC can store your EarthData login details so that you do -not have to provide your username and password at run time or in every script. +## Using Earthdata Credentials in PODPAC + + +PODPAC uses Earthdata credentials to access the SMAP data source nodes. +You can store the credentials for SMAP nodes using the `Node` method `set_credentials`. -> **NOTE:** PODPAC stores your credentials in a plain text file. If this is -a security issue for you, do not use this method. +To store credentials for SMAP nodes, use the following code in an interactive Python session: -To store your credentials use the following code in an interactive Python session: ```python -from podpac.core.authentication import EarthDataSession -eds = EarthDataSession() -eds.update_login() +from podpac.datalib import SMAP + +node = SMAP() +node.set_credentials(username="", password="") ``` -Then follow the on-screen prompts to enter our username and password. -Your credentials will be saved in `$HOME\.podpac\settings.json` -where `$HOME` is usually `C:\users\` on Windows and `/home/` -on Linux systems. +The `set_credentials` method stores credentials for a Node in the PODPAC settings. +To persistently save the credentials in the PODPAC settings +(to avoid running `set_credentials` at runtime or in a script), run `settings.save()`: -## Setting credentials at Runtime -To set credentials at runtime, you can either provide an authenticated session -or the username and password to the PyDAP node or child node. For example +> **NOTE:** PODPAC stores credentials in plain text. +> Be conscious of outputting the PODPAC settings to a file when it contains credentials. 
-```python -from podpac.authentication import EarthDataSession -eds = EarthDataSession(username=, password=) -from podpac.data import PyDAP -pydap_node = PyDAP(source=, auth_session=eds) ``` +from podpac import settings -Or - -```python -from podpac.data import PyDAP -pydap_node = PyDAP(source=, username=, password=) +settings.save() ``` + +Your credentials will be saved in `$HOME\.podpac\settings.json` +where `$HOME` is usually `C:\users\` on Windows and `/home/` +on Linux systems. diff --git a/doc/source/examples.rst b/doc/source/examples.rst index 489bcc532..aa62c6ab6 100644 --- a/doc/source/examples.rst +++ b/doc/source/examples.rst @@ -140,7 +140,7 @@ configured. output = node.eval(coords) -`Notebooks `__ +Notebooks --------- Interactive PODPAC examples are distributed as `example Jupyter diff --git a/doc/source/install.md b/doc/source/install.md index 3d569c1a6..eb335f5d4 100644 --- a/doc/source/install.md +++ b/doc/source/install.md @@ -5,9 +5,9 @@ PODPAC is available for Windows, Mac, and Linux. Select the installation method the best suits your development environment: - [pip](#install-with-pip): Recommended for most users -- [Docker](#install-with-docker): For use in containers +- [Docker](#docker): For use in containers - [Install from source](#install-from-source): For development -- [Standalone distribution](#standalone-distribution): Includes Python and all dependencies +- [Standalone distribution](#standalone-distibution): Includes Python and all dependencies ## Install with pip @@ -15,8 +15,7 @@ Select the installation method the best suits your development environment: Confirm you have the required dependencies installed on your computer: -- [Python](https://www.python.org/) (3.6 or later) - - We recommend the [Anaconda Python Distribution](https://www.anaconda.com/distribution/#download-section) +- [Python](https://www.python.org/) (3.6 or later) — [Anaconda Python Distribution](https://www.anaconda.com/distribution/#download-section) recommended - See [operating system requirements](dependencies.html#os-specific-requirements) ### Environment @@ -94,7 +93,6 @@ The Window 10 standalone distribution requires no pre-installed operating system - For older versions, substitute `latest` in the url with the version number, i.e. `PODPAC_1.2.0_install_windows10.zip` - Once downloaded, extract the zip file into a folder on your machine. - We recommend expanding it near the root of your drive (e.g. `C:\PODPAC`) due to long file paths that are part of the installation. - - We also recommend *unblocking* the file to speed up the unzipping process. See this [Microsoft Developer Entry](https://blogs.msdn.microsoft.com/delay/p/unblockingdownloadedfile/). Once the folder is unzipped: diff --git a/doc/source/pipelines.md b/doc/source/pipelines.md deleted file mode 100644 index d4c8dd92f..000000000 --- a/doc/source/pipelines.md +++ /dev/null @@ -1,275 +0,0 @@ -# Pipelines - -*DEPRECATED: This functionality has been integrated into the Node class. Pipelines will be removed in popdac 2* - -## Introduction -A podpac pipeline can be defined using JSON. The pipeline definition describes the *nodes* used in the pipeline and the *output* for the pipeline. - -### Attributes - - * `nodes`: node definitions *(object, required)* - * `output`: output definition *(object, optional)* - -### Sample - -``` -{ - "nodes": { - "myNode": { ... }, - "myOtherNode": { ... } - ... - "myResult": { ... } - }, - "output": { - "node": "myResult", - "mode": "file", - ... 
- } -} -``` - -## Node definitions - -A node definition defines the node and its inputs, attributes, and default execution parameters. It also names the node so that it can be used as an input to other nodes in the pipeline. Nodes must be defined before they are referenced in a later node. - -The podpac core library includes three basic types of nodes: *DataSource*, *Compositor*, and *Algorithm*. A *Pipeline* node can also be used an an input to a pipeline. These nodes and their additional attributes are described below. - -### Common Attributes - - * `node`: a path to the node class. The path is relative to the podpac module, unless `plugin` is defined. See Notes. *(string, required)* - * `plugin`: a path to a plugin module to use (prepended node path). See Notes. *(string, optional)* - * `attrs`: set attributes in the node for custom behavior. Each value can be a number, string, boolean, dictionary, or list. *(object, optional)* - -## DataSource - -### Sample - -``` -{ - "nodes": { - "sm": { - "node": "algorithm.CoordData", - "attrs": { - "coord_name": "time" - } - } - } -} -``` - -## Compositor - -### Additional Attributes - - * `sources`: nodes to composite *(list, required)* - -### Sample - -``` -{ - "nodes": { - "SourceA": { ... }, - "SourceB": { ... }, - "SourceC": { ... }, - - MyCompositor": { - "node": "OrderedCompositor", - "sources": ["SourceA", "SourceB", "SourceC"] - } - } -} -``` - -## Algorithm - -### Additional Attributes - * `inputs`: node inputs to the algorithm. *(object, required)* - -### Sample - -``` -{ - "nodes": { - "MyNode": { ... }, - "MyOtherNode": { ... }, - "MyThirdNode": { ... }, - - "downscaled_sm": { - "node": "Arithmetic", - "inputs": { - "A": "MyNode", - "B": "MyOtherNode", - "C": "MyThirdNode" - }, - "attrs": { - "eqn": "A + {tsmtr} / {kappa} * (B - C)", - "params": { - "kappa": "13", - "tsmtr": "0.3" - } - } - } - } -} -``` - -## Pipeline - -### Additional Attributes - * `path`: path to another pipeline JSON definition. *(string, required)* - -### Sample - -``` -{ - "nodes": { - "MyDataSource": { - ... - }, - - "MyOtherPipeline": { - "path": "path to pipeline" - }, - - "result": { - "node": "Arithmetic", - "inputs": { - "A": "MyDataSource", - "B": "MyOtherPipeline", - }, - "attrs": { - "eqn": "A + B" - } - } - } -} -``` - -### Notes - - * The `node` path should include the submodule path and the node class. The submodule path is omitted for top-level classes. For example: - - `"node": "datalib.smap.SMAP"` is equivalent to `from podpac.datalib.smap import SMAP`. - - `"node": "compositor.OrderedCompositor"` is equivalent to `from podpac.compositor import OrderedCompositor`. - * The `plugin` path replaces 'podpac' in the full node path. For example - - `"plugin": "path.to.myplugin", "node": "mymodule.MyCustomNode"` is equivalent to `from path.to.myplugin.mymodule import MyCustomNode`. - - `"plugin": "myplugin", "node": "MyCustomNode"` is equivalent to `from myplugin import MyCustomNode` - -## Output Definition - -The output definition defines the node to output and, optionally, an additional output mode along with associated parameters. If an output definition is not supplied, the last defined node is used. - -Podpac provides several builtin output types, *file* and *image*. You can also define custom outputs in a plugins. - -### Common Attributes - - * `node`: The nodes to output. *(list, required)* - * `mode`: For builtin outputs, options are 'none' (default), 'file', 'image'. *(string, optional)* - -## None (default) - -No additional output. 
The output will be returned from the `Pipeline.execute` method. - -## Files - -Nodes can be output to file in a variety of formats. - -### Additional Attributes - - * `format`: file format, options are 'pickle' (default), 'geotif', 'png'. *(string, optional)* - * `outdir`: destination path for the output file *(string, required)* - -### Sample - -``` -{ - "nodes": { - "MyNode1": { ... }, - "MyNode2": { ... } - }, - - "output": { - "nodes": "MyNode2", - "mode": "file", - "format": "png", - "outdir": "C:\Path\To\OutputData" - } -} -``` - -## Images - -Nodes can be output to a png image (in memory). - -### Additional Attributes - - * `format`: image format, options are 'png' (default). *(string, optional)* - * `vmin`: min value for the colormap *(number, optional)* - * `vmax`: max value for the colormap *(number, optional)* - -### Sample - -``` -{ - "nodes": { - "MyNode1": { ... }, - "MyNode2": { ... } - }, - - "output": { - "nodes": "MyNode2", - "mode": "image", - "format": "png", - "vmin": 0.1, - "vmax": 0.35 - } -} -``` - -## Custom Outputs - -Custom outputs can be defined in a plugin by subclassing the `Output` base class found in `core.pipeline.output`. Custom -outputs must define the `write` method with no arguments, and may define additional parameters. - -### Attributes - -Replace the 'mode' parameter with a plugin path and output class name: - - * `plugin`: path to a plugin module to use *(string, required)* - * `output`: output class name *(string, required)* - -### Sample Custom Output Class - -File: **my_plugin/outputs.py** - -``` -import numpy as np -import traitlets as tl -import podpac - -class NpyOutput(podpac.core.pipeline.output.Output): - path = tl.String() - allow_pickle = tl.Bool(True) - fix_imports = tl.Bool(True) - - def write(self): - numpy.save(self.path, self.node.output.data, allow_pickle=self.allow_pickle, fix_imports=self.fix_imports) -``` - -### Sample Pipeline - -``` -{ - "nodes": { - "MyNode1": { ... }, - "MyNode2": { ... } - }, - - "output": { - "nodes": "MyNode2", - "plugin": "my_plugin", - "output": "NpyOutput", - "path": "my_pipeline_output.npy", - "allow_pickle": false - } -} -``` diff --git a/doc/source/settings.md b/doc/source/settings.md index c6e56a67a..484e96a3d 100644 --- a/doc/source/settings.md +++ b/doc/source/settings.md @@ -4,7 +4,7 @@ This tutorial describes methods for viewing and editing PODPAC settings used to To follow along, open a Python interpreter or Jupyter notebook in the Python environment where PODPAC is installed. -``` +```bash # activate the PODPAC environment, using anaconda $ conda activate podpac diff --git a/doc/source/specs/nodes.md b/doc/source/specs/nodes.md index 1005717e1..6a261177a 100644 --- a/doc/source/specs/nodes.md +++ b/doc/source/specs/nodes.md @@ -82,17 +82,17 @@ def eval(coordinates, output=None, method=None): **Notes on what's returned: ** * This function should always return a `UnitsDataArray` object. ```python - >>> node.native_coordinates = Coordinate(lat=(90, -90, -1.), lon=(-180, 180, 2.), order=['lat', 'lon']) - >>> type(node.initialize_output_array()) + [.] node.native_coordinates = Coordinate(lat=(90, -90, -1.), lon=(-180, 180, 2.), order=['lat', 'lon']) + [.] 
type(node.initialize_output_array()) podpac.core.units.UnitsDataArray ``` * This `UnitsDataArray` may contain the following dimensions `['lat', 'lon', 'time', 'alt']` * For cases with multiple outputs, it may additionally contain the field `band` * This is to supported datasets such as multi-spectral imagery ```python - >>> grey = UnitsDataArray(np.ones((2, 1)), dims=['lat', 'lon'], coords=[[0, 1], [0]]) - >>> rgba = UnitsDataArray(np.ones((2, 1, 4)), dims=['lat', 'lon', 'band'], coords=[[0, 1], [0], ['r', 'g', 'b', 'a']]) - >>> grey + rgba + grey = UnitsDataArray(np.ones((2, 1)), dims=['lat', 'lon'], coords=[[0, 1], [0]]) + rgba = UnitsDataArray(np.ones((2, 1, 4)), dims=['lat', 'lon', 'band'], coords=[[0, 1], [0], ['r', 'g', 'b', 'a']]) + grey + rgba array([[[2., 2., 2., 2.]], @@ -104,12 +104,12 @@ def eval(coordinates, output=None, method=None): ``` * This requres that all the bands have the same `units` ```python - >>> grey = UnitsDataArray(np.ones((2, 1)), dims=['lat', 'lon'], + grey = UnitsDataArray(np.ones((2, 1)), dims=['lat', 'lon'], coords=[[0, 1], [0]], attrs={'units': ureg.m}) - >>> rgba1 = UnitsDataArray(np.ones((2, 1, 4)), dims=['lat', 'lon', 'band'], + rgba1 = UnitsDataArray(np.ones((2, 1, 4)), dims=['lat', 'lon', 'band'], coords=[[0, 1], [0], ['r', 'g', 'b', 'a']], attrs={'units': ureg.km}) - >>> grey + rgba1 + grey + rgba1 array([[[1001., 1001., 1001., 1001.]], @@ -123,12 +123,12 @@ def eval(coordinates, output=None, method=None): ``` * [**FUTURE FEATURE**] We could support different units for differents bands as follows, but the dimensionality still has to be consistent, unless a specific band is selected ```python - >>> rgba2 = UnitsDataArray(np.ones((2, 1, 4)), dims=['lat', 'lon', 'band'], + [.] rgba2 = UnitsDataArray(np.ones((2, 1, 4)), dims=['lat', 'lon', 'band'], coords=[[0, 1], [0], ['r', 'g', 'b', 'a']], attrs={'units': {'r': ureg.m, 'g': ureg.ft, 'b': ureg.km, 'a': ureg.mile}}) - >>> grey + rgba2 + [.] grey + rgba2 array([[[2., 1.3048, 1001., 1610.344]], @@ -143,17 +143,17 @@ def eval(coordinates, output=None, method=None): * Dimensions should be returned in the order of the requested `coordinates` * eg. if underlying dataset has `['lat', 'lon']` `coordinates`, but the request has `['lon', 'lat']` coordinates, the output should match `['lon', 'lat']` ```python - >>> node.native_coordinates = Coordinate(lat=(90, -90, -1.), lon=(-180, 180, 2.), order=['lat', 'lon']) - >>> node.evaluated_coordinates = Coordinate(lon=(-180, 180, 4.), lat=(90, -90, -2.), order=['lon', 'lat']) - >>> node.initialize_output_array().dims + [.] node.native_coordinates = Coordinate(lat=(90, -90, -1.), lon=(-180, 180, 2.), order=['lat', 'lon']) + [.] node.evaluated_coordinates = Coordinate(lon=(-180, 180, 4.), lat=(90, -90, -2.), order=['lon', 'lat']) + [.] node.initialize_output_array().dims ('lon', 'lat') ``` * If the underlying Node has unstacked dimensions not in the request, an exception is raised * eg. Node has `['lat', 'lon', 'time']` dimensions, but coordinates only have `['time']` ```python - >>> node.native_coordinates = Coordinate(lat=(90, -90, -1.), lon=(-180, 180, 2.), order=['lat', 'lon']) - >>> node.evaluated_coordinates = Coordinate(lon=(-180, 180, 4.),) - >>> node.initialize_output_array() + [.] node.native_coordinates = Coordinate(lat=(90, -90, -1.), lon=(-180, 180, 2.), order=['lat', 'lon']) + [.] node.evaluated_coordinates = Coordinate(lon=(-180, 180, 4.),) + [.] 
node.initialize_output_array() CoordinateError: 'Dimension "lat" not present in requested coordinates with dims ["lon"]' ``` * Because some datasets may be huge, and without information about the subset, the safest behaviour is to throw an exception @@ -162,9 +162,9 @@ def eval(coordinates, output=None, method=None): * If the request has unstacked dimensions not in the Node, just return without those dimensions * eg. Node has `['lat', 'lon']` dimensions, but evaluated coordinates have `['time', 'lon', 'lat']` then `UnitsDataArray` will have dimensions `['lon', 'lat']` ```python - >>> node.native_coordinates = Coordinate(lat=45, lon=0, order=['lat', 'lon']) - >>> node.evaluated_coordinates = Coordinate(lat=45, lon=0, time='2018-01-01', order=['lat', 'lon', 'time']) - >>> node.initialize_output_array() + [.] node.native_coordinates = Coordinate(lat=45, lon=0, order=['lat', 'lon']) + [.] node.evaluated_coordinates = Coordinate(lat=45, lon=0, time='2018-01-01', order=['lat', 'lon', 'time']) + [.] node.initialize_output_array() array([[nan]]) Coordinates: @@ -229,9 +229,9 @@ def find_coordinates(dims=None, bounds=None, number=None, sortby='size', stop_ty Notes ------ The `native_coordinates` could be retrieved using (for example): - >>> c = node.find_coordinates('time') - >>> node_key = list(c['time'].keys())[0] - >>> node[node_key].native_coordinates + [.] c = node.find_coordinates('time') + [.] node_key = list(c['time'].keys())[0] + [.] node[node_key].native_coordinates ''' pass ``` diff --git a/podpac/__init__.py b/podpac/__init__.py index 483c4daf3..6943ce2b9 100644 --- a/podpac/__init__.py +++ b/podpac/__init__.py @@ -42,20 +42,20 @@ def makedirs(name, mode=511, exist_ok=False): from podpac.core.settings import settings from podpac.core.coordinates import Coordinates, crange, clinspace from podpac.core.node import Node, NodeException -import podpac.core.authentication as authentication -from podpac.core.utils import NodeTrait +from podpac.core.utils import cached_property from podpac.core.units import ureg as units, UnitsDataArray # Organized submodules # These files are simply wrappers to create a curated namespace of podpac modules from podpac import algorithm +from podpac import authentication from podpac import data from podpac import interpolators from podpac import coordinates from podpac import compositor -from podpac import pipeline from podpac import managers from podpac import utils +from podpac import style ## Developer API from podpac import core diff --git a/podpac/alglib/__init__.py b/podpac/alglib/__init__.py new file mode 100644 index 000000000..fad0d686b --- /dev/null +++ b/podpac/alglib/__init__.py @@ -0,0 +1,8 @@ +""" +Datalib Public API + +This module gets imported in the root __init__.py +and exposed its contents to podpac.datalib +""" + +from podpac.alglib import climatology diff --git a/podpac/alglib/climatology.py b/podpac/alglib/climatology.py new file mode 100644 index 000000000..19e36eb4f --- /dev/null +++ b/podpac/alglib/climatology.py @@ -0,0 +1,62 @@ +""" +PODPAC node to compute beta fit of seasonal variables +""" + +import logging +import numpy as np +import xarray as xr +import traitlets as tl +from lazy_import import lazy_module +from scipy.stats import beta +from scipy.stats._continuous_distns import FitSolverError + +# optional imports +h5py = lazy_module("h5py") + +# Internal dependencies +import podpac +from podpac.core.algorithm.stats import DayOfYearWindow + +# Set up logging +_log = logging.getLogger(__name__) + + +class 
BetaFitDayOfYear(DayOfYearWindow): + """ + This fits a beta distribution to day of the year in the requested coordinates over a window. It returns the beta + distribution parameters 'a', and 'b' as part of the output. It may also return a number of percentiles. + + Attributes + ----------- + percentiles: list, optional + Default is []. After computing the beta distribution, optionally compute the value of the function for the given + percentiles in the list. The results will be available as an output named ['d0', 'd1',...] for each entry in + the list. + """ + + percentiles = tl.List().tag(attr=True) + rescale = tl.Bool(True).tag(attr=True) + + @property + def outputs(self): + return ["a", "b"] + ["d{}".format(i) for i in range(len(self.percentiles))] + + def function(self, data, output): + # define the fit function + try: + a, b, loc, scale = beta.fit(data, floc=0, fscale=1) + except FitSolverError as e: + print(e) + return output + + # populate outputs for this point + output.loc[{"output": "a"}] = a + output.loc[{"output": "b"}] = b + for ii, d in enumerate(self.percentiles): + output.loc[{"output": "d" + str(ii)}] = beta.ppf(d, a, b) + + return output + + def rescale_outputs(self, output, scale_max, scale_min): + output[..., 2:] = (output[..., 2:] * (scale_max - scale_min)) + scale_min + return output diff --git a/podpac/algorithm.py b/podpac/algorithm.py index 3501834a5..742250143 100644 --- a/podpac/algorithm.py +++ b/podpac/algorithm.py @@ -20,6 +20,7 @@ Kurtosis, DayOfYear, GroupReduce, + ResampleReduce, ) from podpac.core.algorithm.coord_select import ExpandCoordinates, SelectCoordinates, YearSubstituteCoordinates from podpac.core.algorithm.signal import Convolution, SpatialConvolution, TimeConvolution diff --git a/podpac/authentication.py b/podpac/authentication.py new file mode 100644 index 000000000..2bef51622 --- /dev/null +++ b/podpac/authentication.py @@ -0,0 +1,8 @@ +""" +Authentication Public Module +""" + +# REMINDER: update api docs (doc/source/api.rst) to reflect changes to this file + + +from podpac.core.authentication import RequestsSessionMixin, S3Mixin diff --git a/podpac/compositor.py b/podpac/compositor.py index 418c3c5ae..166dcf323 100644 --- a/podpac/compositor.py +++ b/podpac/compositor.py @@ -4,4 +4,5 @@ # REMINDER: update api docs (doc/source/user/api.rst) to reflect changes to this file -from podpac.core.compositor import Compositor, OrderedCompositor +from podpac.core.compositor.ordered_compositor import OrderedCompositor +from podpac.core.compositor.tile_compositor import UniformTileCompositor, UniformTileMixin diff --git a/podpac/core/algorithm/algorithm.py b/podpac/core/algorithm/algorithm.py index d770ae05c..a9ed89e3d 100644 --- a/podpac/core/algorithm/algorithm.py +++ b/podpac/core/algorithm/algorithm.py @@ -29,40 +29,17 @@ class BaseAlgorithm(Node): """ @property - def _inputs(self): - # this first version is nicer, but the gettattr(self, ref) can take a - # a long time if it is has a default value or is a property - - # return = { - # ref:getattr(self, ref) - # for ref in self.trait_names() - # if isinstance(getattr(self, ref, None), Node) - # } - + def inputs(self): + # gettattr(self, ref) can take a long time, so we inspect trait.klass instead return { ref: getattr(self, ref) for ref, trait in self.traits().items() if hasattr(trait, "klass") and Node in inspect.getmro(trait.klass) and getattr(self, ref) is not None } - @property - def base_definition(self): - """Base node definition. 
- - Returns - ------- - OrderedDict - Extends base description by adding 'inputs' - """ - - d = super(BaseAlgorithm, self).base_definition - inputs = self._inputs - d["inputs"] = OrderedDict([(key, inputs[key]) for key in sorted(inputs.keys())]) - return d - def find_coordinates(self): """ - Get the available native coordinates for the inputs to the Node. + Get the available coordinates for the inputs to the Node. Returns ------- @@ -70,7 +47,7 @@ def find_coordinates(self): list of available coordinates (Coordinate objects) """ - return [c for node in self._inputs.values() for c in node.find_coordinates()] + return [c for node in self.inputs.values() for c in node.find_coordinates()] class Algorithm(BaseAlgorithm): @@ -117,7 +94,7 @@ def eval(self, coordinates, output=None): inputs = {} if settings["MULTITHREADING"]: - n_threads = thread_manager.request_n_threads(len(self._inputs)) + n_threads = thread_manager.request_n_threads(len(self.inputs)) if n_threads == 1: thread_manager.release_n_threads(n_threads) else: @@ -132,13 +109,13 @@ def f(node): pool = thread_manager.get_thread_pool(processes=n_threads) # Evaluate nodes in parallel/asynchronously - results = [pool.apply_async(f, [node]) for node in self._inputs.values()] + results = [pool.apply_async(f, [node]) for node in self.inputs.values()] # Collect the results in dictionary - for key, res in zip(self._inputs.keys(), results): + for key, res in zip(self.inputs.keys(), results): inputs[key] = res.get() - # This prevents any more tasks from being submitted to the pool, and will close the workers one done + # This prevents any more tasks from being submitted to the pool, and will close the workers once done pool.close() # Release these number of threads back to the thread pool @@ -146,7 +123,7 @@ def f(node): self._multi_threaded = True else: # Evaluate nodes in serial - for key, node in self._inputs.items(): + for key, node in self.inputs.items(): inputs[key] = node.eval(coordinates) self._multi_threaded = False @@ -159,7 +136,7 @@ def f(node): if output is None: output = result else: - output[:] = result + output[:] = result.data[:] elif isinstance(result, xr.DataArray): if output is None: output = self.create_output_array( @@ -195,7 +172,10 @@ class UnaryAlgorithm(BaseAlgorithm): Developers of new Algorithm nodes need to implement the `eval` method. """ - source = NodeTrait() + source = NodeTrait().tag(attr=True) + + # list of attribute names, used by __repr__ and __str__ to display minimal info about the node + _repr_keys = ["source"] @tl.default("outputs") def _default_outputs(self): diff --git a/podpac/core/algorithm/coord_select.py b/podpac/core/algorithm/coord_select.py index db75abaad..00c677662 100644 --- a/podpac/core/algorithm/coord_select.py +++ b/podpac/core/algorithm/coord_select.py @@ -32,7 +32,7 @@ class ModifyCoordinates(UnaryAlgorithm): Modification parameters for given dimension. Varies by node. 
""" - coordinates_source = NodeTrait() + coordinates_source = NodeTrait().tag(attr=True) lat = tl.List().tag(attr=True) lon = tl.List().tag(attr=True) time = tl.List().tag(attr=True) @@ -67,7 +67,9 @@ def eval(self, coordinates, output=None): self._requested_coordinates = coordinates self._modified_coordinates = Coordinates( - [self.get_modified_coordinates1d(coordinates, dim) for dim in coordinates.dims], crs=coordinates.crs + [self.get_modified_coordinates1d(coordinates, dim) for dim in coordinates.dims], + crs=coordinates.crs, + validate_crs=False, ) for dim in self._modified_coordinates.udims: @@ -106,9 +108,16 @@ class ExpandCoordinates(ModifyCoordinates): Expansion parameters for the given dimension: The options are:: * [start_offset, end_offset, step] to expand uniformly around each input coordinate. * [start_offset, end_offset] to expand using the available source coordinates around each input coordinate. + + bounds_only: bool + Default is False. If True, will only expand the bounds of the overall coordinates request. Otherwise, it will + expand around EACH coordinate in the request. For example, with bounds_only == True, and an expansion of 3 + you may expand [5, 6, 8] to [2, 3, 4, 5, 6, 7, 8, 9, 10, 11], whereas with bounds_only == False, it becomes + [[2, 5, 8], [3, 6, 9], [5, 8, 11]] (brackets added for clarity, they will be concatenated). """ substitute_eval_coords = tl.Bool(False, read_only=True) + bounds_only = tl.Bool(False).tag(attr=True) def get_modified_coordinates1d(self, coords, dim): """Returns the expanded coordinates for the requested dimension, depending on the expansion parameter for the @@ -133,7 +142,7 @@ def get_modified_coordinates1d(self, coords, dim): return coords1d if len(expansion) == 2: - # use available native coordinates + # use available coordinates dstart = make_coord_delta(expansion[0]) dstop = make_coord_delta(expansion[1]) @@ -141,14 +150,30 @@ def get_modified_coordinates1d(self, coords, dim): if len(available_coordinates) != 1: raise ValueError("Cannot implicity expand coordinates; too many available coordinates") acoords = available_coordinates[0][dim] - cs = [acoords.select((add_coord(x, dstart), add_coord(x, dstop))) for x in coords1d.coordinates] + if self.bounds_only: + cs = [ + acoords.select( + add_coord(coords1d.coordinates[0], dstart), add_coord(coords1d.coordinates[-1], dstop) + ) + ] + else: + cs = [acoords.select((add_coord(x, dstart), add_coord(x, dstop))) for x in coords1d.coordinates] elif len(expansion) == 3: # use a explicit step size dstart = make_coord_delta(expansion[0]) dstop = make_coord_delta(expansion[1]) step = make_coord_delta(expansion[2]) - cs = [UniformCoordinates1d(add_coord(x, dstart), add_coord(x, dstop), step) for x in coords1d.coordinates] + if self.bounds_only: + cs = [ + UniformCoordinates1d( + add_coord(coords1d.coordinates[0], dstart), add_coord(coords1d.coordinates[-1], dstop), step + ) + ] + else: + cs = [ + UniformCoordinates1d(add_coord(x, dstart), add_coord(x, dstop), step) for x in coords1d.coordinates + ] else: raise ValueError("Invalid expansion attrs for '%s'" % dim) diff --git a/podpac/core/algorithm/generic.py b/podpac/core/algorithm/generic.py index fd1bccbd3..a29b5cd2b 100644 --- a/podpac/core/algorithm/generic.py +++ b/podpac/core/algorithm/generic.py @@ -4,6 +4,7 @@ from __future__ import division, unicode_literals, print_function, absolute_import +import sys import warnings import numpy as np @@ -21,15 +22,18 @@ from podpac.core.utils import NodeTrait from podpac.core.algorithm.algorithm 
import Algorithm +if sys.version_info.major == 2: + + class PermissionError(OSError): + pass + class GenericInputs(Algorithm): """Base class for Algorithms that accept generic named inputs.""" - inputs = tl.Dict() + inputs = tl.Dict(read_only=True) - @property - def _inputs(self): - return self.inputs + _repr_keys = ["inputs"] def _first_init(self, **kwargs): trait_names = self.trait_names() @@ -38,7 +42,14 @@ def _first_init(self, **kwargs): raise RuntimeError("Trait '%s' is reserved and cannot be used as an Generic Algorithm input" % key) input_keys = [key for key in kwargs if key not in trait_names and isinstance(kwargs[key], Node)] inputs = {key: kwargs.pop(key) for key in input_keys} - return super(GenericInputs, self)._first_init(inputs=inputs, **kwargs) + self.set_trait("inputs", inputs) + return super(GenericInputs, self)._first_init(**kwargs) + + @property + def _base_definition(self): + d = super(GenericInputs, self)._base_definition + d["inputs"] = self.inputs + return d class Arithmetic(GenericInputs): @@ -54,6 +65,8 @@ class Arithmetic(GenericInputs): eqn = tl.Unicode().tag(attr=True) params = tl.Dict().tag(attr=True) + _repr_keys = ["eqn"] + def init(self): if not settings.allow_unsafe_eval: warnings.warn( @@ -191,13 +204,15 @@ class Mask(Algorithm): """ - source = NodeTrait() - mask = NodeTrait() + source = NodeTrait().tag(attr=True) + mask = NodeTrait().tag(attr=True) masked_val = tl.Float(np.nan).tag(attr=True) bool_val = tl.Float(1).tag(attr=True) bool_op = tl.Enum(["==", "<", "<=", ">", ">="], default_value="==").tag(attr=True) in_place = tl.Bool(False).tag(attr=True) + _repr_keys = ["source", "mask"] + def algorithm(self, inputs): """ Sets the values in inputs['source'] to self.masked_val using (inputs['mask'] ) """ @@ -237,7 +252,8 @@ class Combine(GenericInputs): @tl.default("outputs") def _default_outputs(self): - return list(self.inputs.keys()) + input_keys = list(self.inputs.keys()) + return input_keys def algorithm(self, inputs): return np.stack([inputs[key] for key in self.inputs], axis=-1) diff --git a/podpac/core/algorithm/signal.py b/podpac/core/algorithm/signal.py index 7354f8176..933a462a9 100644 --- a/podpac/core/algorithm/signal.py +++ b/podpac/core/algorithm/signal.py @@ -138,12 +138,12 @@ def eval(self, coordinates, output=None): add_coord(coord.start, s_start * coord.step), add_coord(coord.stop, s_end * coord.step + 1e-07 * coord.step), coord.step, - **coord.properties, + **coord.properties ) ) exp_slice.append(slice(-s_start, -s_end)) exp_slice = tuple(exp_slice) - expanded_coordinates = Coordinates(exp_coords) + expanded_coordinates = Coordinates(exp_coords, crs=coordinates.crs, validate_crs=False) if settings["DEBUG"]: self._expanded_coordinates = expanded_coordinates diff --git a/podpac/core/algorithm/stats.py b/podpac/core/algorithm/stats.py index 86d2ca8dc..60ee65baa 100644 --- a/podpac/core/algorithm/stats.py +++ b/podpac/core/algorithm/stats.py @@ -7,6 +7,7 @@ import warnings from operator import mul from functools import reduce +import logging import xarray as xr import numpy as np @@ -14,15 +15,19 @@ import traitlets as tl from six import string_types +# Internal dependencies import podpac from podpac.core.coordinates import Coordinates from podpac.core.node import Node -from podpac.core.algorithm.algorithm import UnaryAlgorithm +from podpac.core.algorithm.algorithm import UnaryAlgorithm, Algorithm from podpac.core.utils import common_doc, NodeTrait from podpac.core.node import COMMON_NODE_DOC, node_eval COMMON_DOC = COMMON_NODE_DOC.copy() +# 
Set up logging +_log = logging.getLogger(__name__) + class Reduce(UnaryAlgorithm): """Base node for statistical algorithms @@ -45,21 +50,6 @@ def _first_init(self, **kwargs): kwargs["dims"] = [kwargs["dims"]] return super(Reduce, self)._first_init(**kwargs) - def _get_dims(self, out): - """ - Translates requested reduction dimensions. - - Parameters - ---------- - out : UnitsDataArray - The output array - - Returns - ------- - list - List of dimensions after reduction - """ - def dims_axes(self, output): """Finds the indices for the dimensions that will be reduced. This is passed to numpy. @@ -840,6 +830,8 @@ def reduce(self, x): # Time-Grouped Reduce # ============================================================================= +_REDUCE_FUNCTIONS = ["all", "any", "count", "max", "mean", "median", "min", "prod", "std", "sum", "var", "custom"] + class GroupReduce(UnaryAlgorithm): """ @@ -857,15 +849,13 @@ class GroupReduce(UnaryAlgorithm): Source node """ - coordinates_source = NodeTrait(allow_none=True) + _repr_keys = ["source", "groupby", "reduce_fn"] + coordinates_source = NodeTrait(allow_none=True).tag(attr=True) # see https://github.com/pydata/xarray/blob/eeb109d9181c84dfb93356c5f14045d839ee64cb/xarray/core/accessors.py#L61 - groupby = tl.CaselessStrEnum(["dayofyear"]) # could add season, month, etc - - reduce_fn = tl.CaselessStrEnum( - ["all", "any", "count", "max", "mean", "median", "min", "prod", "std", "sum", "var", "custom"] - ) - custom_reduce_fn = tl.Any() + groupby = tl.CaselessStrEnum(["dayofyear", "weekofyear", "season", "month"], allow_none=True).tag(attr=True) + reduce_fn = tl.CaselessStrEnum(_REDUCE_FUNCTIONS).tag(attr=True) + custom_reduce_fn = tl.Any(allow_none=True, default_value=None).tag(attr=True) _source_coordinates = tl.Instance(Coordinates) @@ -873,33 +863,6 @@ class GroupReduce(UnaryAlgorithm): def _default_coordinates_source(self): return self.source - def _get_source_coordinates(self, requested_coordinates): - # get available time coordinates - # TODO do these two checks during node initialization - available_coordinates = self.coordinates_source.find_coordinates() - if len(available_coordinates) != 1: - raise ValueError("Cannot evaluate this node; too many available coordinates") - avail_coords = available_coordinates[0] - if "time" not in avail_coords.udims: - raise ValueError("GroupReduce coordinates source node must be time-dependent") - - # intersect grouped time coordinates using groupby DatetimeAccessor - avail_time = xr.DataArray(avail_coords.coords["time"]) - eval_time = xr.DataArray(requested_coordinates.coords["time"]) - N = getattr(avail_time.dt, self.groupby) - E = getattr(eval_time.dt, self.groupby) - native_time_mask = np.in1d(N, E) - - # use requested spatial coordinates and filtered available times - coords = Coordinates( - time=avail_time.data[native_time_mask], - lat=requested_coordinates["lat"], - lon=requested_coordinates["lon"], - order=("time", "lat", "lon"), - ) - - return coords - @common_doc(COMMON_DOC) @node_eval def eval(self, coordinates, output=None): @@ -922,12 +885,7 @@ def eval(self, coordinates, output=None): If source it not time-depended (required by this node). 
""" - self._source_coordinates = self._get_source_coordinates(coordinates) - - if output is None: - output = self.create_output_array(coordinates) - - source_output = self.source.eval(self._source_coordinates) + source_output = self.source.eval(coordinates) # group grouped = source_output.groupby("time.%s" % self.groupby) @@ -939,14 +897,25 @@ def eval(self, coordinates, output=None): # standard, e.g. grouped.median('time') out = getattr(grouped, self.reduce_fn)("time") - # map - eval_time = xr.DataArray(coordinates.coords["time"]) - E = getattr(eval_time.dt, self.groupby) - out = out.sel(**{self.groupby: E}).rename({self.groupby: "time"}) - output[:] = out.transpose(*output.dims).data + out = out.rename({self.groupby: "time"}) + if output is None: + coords = podpac.coordinates.merge_dims( + [coordinates.drop("time"), Coordinates([out.coords["time"]], ["time"])] + ) + coords = coords.transpose(*out.dims) + output = self.create_output_array(coords, data=out.data) + else: + output.data[:] = out.data[:] + + ## map + # eval_time = xr.DataArray(coordinates.coords["time"]) + # E = getattr(eval_time.dt, self.groupby) + # out = out.sel(**{self.groupby: E}).rename({self.groupby: "time"}) + # output[:] = out.transpose(*output.dims).data return output + @property def base_ref(self): """ Default node reference/name in node definitions @@ -959,6 +928,100 @@ def base_ref(self): return "%s.%s.%s" % (self.source.base_ref, self.groupby, self.reduce_fn) +class ResampleReduce(UnaryAlgorithm): + """ + Resample a time-dependent source node using a statistical operation to achieve the result. + + Attributes + ---------- + custom_reduce_fn : function + required if reduce_fn is 'custom'. + resampleby : str + datetime sub-accessor. Currently 'dayofyear' is the enabled option. + reduce_fn : str + builtin xarray groupby reduce function, or 'custom'. + source : podpac.Node + Source node + """ + + _repr_keys = ["source", "resampleby", "reduce_fn"] + coordinates_source = NodeTrait(allow_none=True).tag(attr=True) + + # see https://github.com/pydata/xarray/blob/eeb109d9181c84dfb93356c5f14045d839ee64cb/xarray/core/accessors.py#L61 + resample = tl.Unicode().tag(attr=True) # could add season, month, etc + reduce_fn = tl.CaselessStrEnum(_REDUCE_FUNCTIONS).tag(attr=True) + custom_reduce_fn = tl.Any(allow_none=True, default_value=None).tag(attr=True) + + _source_coordinates = tl.Instance(Coordinates) + + @tl.default("coordinates_source") + def _default_coordinates_source(self): + return self.source + + @common_doc(COMMON_DOC) + @node_eval + def eval(self, coordinates, output=None): + """Evaluates this nodes using the supplied coordinates. + + Parameters + ---------- + coordinates : podpac.Coordinates + {requested_coordinates} + output : podpac.UnitsDataArray, optional + {eval_output} + + Returns + ------- + {eval_return} + + Raises + ------ + ValueError + If source it not time-depended (required by this node). + """ + + source_output = self.source.eval(coordinates) + + # group + grouped = source_output.resample(time=self.resample) + + # reduce + if self.reduce_fn == "custom": + out = grouped.reduce(self.custom_reduce_fn) + else: + # standard, e.g. 
+            out = getattr(grouped, self.reduce_fn)()
+
+        if output is None:
+            coords = podpac.coordinates.merge_dims(
+                [coordinates.drop("time"), Coordinates([out.coords["time"]], ["time"])]
+            )
+            coords = coords.transpose(*out.dims)
+            output = self.create_output_array(coords, data=out.data)
+        else:
+            output.data[:] = out.data[:]
+
+        ## map
+        # eval_time = xr.DataArray(coordinates.coords["time"])
+        # E = getattr(eval_time.dt, self.groupby)
+        # out = out.sel(**{self.groupby: E}).rename({self.groupby: "time"})
+        # output[:] = out.transpose(*output.dims).data
+
+        return output
+
+    @property
+    def base_ref(self):
+        """
+        Default node reference/name in node definitions
+
+        Returns
+        -------
+        str
+            Default node reference/name in node definitions
+        """
+        return "%s.%s.%s" % (self.source.base_ref, self.resample, self.reduce_fn)
+
+
 class DayOfYear(GroupReduce):
     """
     Group a time-dependent source node by day of year and compute a statistic for each group.
@@ -974,3 +1037,145 @@ class DayOfYear(GroupReduce):
     """

     groupby = "dayofyear"
+
+
+class DayOfYearWindow(Algorithm):
+    """
+    This applies a function over a moving window around day-of-year in the requested coordinates.
+    It includes the ability to rescale the input/outputs. Note that if the input coordinates include multiple years,
+    the moving window will include all of the data inside the day-of-year window.
+
+    Users need to implement the 'function' method.
+
+    Attributes
+    -----------
+    source: podpac.Node
+        The source node from which the statistics will be computed
+    window: int, optional
+        Default is 0. The size of the window over which to compute the distribution. This is always centered about the
+        day-of-year. The total number of days is always an odd number. For example, window=2 and window=3 will compute
+        the function over [x-1, x, x + 1] and report it as the result for x, where x is a day of the year.
+    scale_max: podpac.Node, optional
+        Default is None. A source dataset that can be used to scale the maximum value of the source function so that it
+        will fall between [0, 1]. If None, uses self.scale_float[1].
+    scale_min: podpac.Node, optional
+        Default is None. A source dataset that can be used to scale the minimum value of the source function so that it
+        will fall between [0, 1]. If None, uses self.scale_float[0].
+    scale_float: list, optional
+        Default is None. Floating point numbers used to scale the min [0] and max [1] of the source so that it falls
+        between [0, 1]. If scale_max or scale_min are defined, this property is ignored. If these are defined, the data
+        will be rescaled only if rescale=True below.
+        If None and scale_max/scale_min are not defined, the data is not scaled in any way.
+ rescale: bool, optional + Rescales the output data after being scaled from scale_float or scale_min/max + """ + + source = tl.Instance(podpac.Node).tag(attr=True) + window = tl.Int(0).tag(attr=True) + scale_max = tl.Instance(podpac.Node, default_value=None, allow_none=True).tag(attr=True) + scale_min = tl.Instance(podpac.Node, default_value=None, allow_none=True).tag(attr=True) + scale_float = tl.List(default_value=None, allow_none=True).tag(attr=True) + rescale = tl.Bool(False).tag(attr=True) + + def algorithm(self, inputs): + win = self.window // 2 + source = inputs["source"] + + # Scale the source to range [0, 1], required for the beta distribution + if "scale_max" in inputs: + scale_max = inputs["scale_max"] + elif self.scale_float and self.scale_float[1] is not None: + scale_max = self.scale_float[1] + else: + scale_max = None + + if "scale_min" in inputs: + scale_min = inputs["scale_min"] + elif self.scale_float and self.scale_float[0] is not None: + scale_min = self.scale_float[0] + else: + scale_min = None + + _log.debug("scale_min: {}\nscale_max: {}".format(scale_min, scale_max)) + if scale_min is not None and scale_max is not None: + source = (source.copy() - scale_min) / (scale_max - scale_min) + with np.errstate(invalid="ignore"): + source.data[(source.data < 0) | (source.data > 1)] = np.nan + + # Make the output coordinates with day-of-year as time + coords = xr.Dataset({"time": self._requested_coordinates["time"].coordinates}) + dsdoy = np.sort(np.unique(coords.time.dt.dayofyear)) + latlon_coords = self._requested_coordinates.drop("time") + time_coords = podpac.Coordinates([dsdoy], ["time"]) + coords = podpac.coordinates.merge_dims([latlon_coords, time_coords]) + coords = coords.transpose(*self._requested_coordinates.dims) + output = self.create_output_array(coords) + + # if all-nan input, no need to calculate + if np.all(np.isnan(source)): + return output + + # convert source time coords to day-of-year as well + sdoy = source.time.dt.dayofyear + + # loop over each day of year and compute window + for i, doy in enumerate(dsdoy): + _log.debug("Working on doy {doy} ({i}/{ld})".format(doy=doy, i=i + 1, ld=len(dsdoy))) + + # If either the start or end runs over the year, we need to do an OR on the bool index + # ----->s....<=e------ .in -out + # ..<=e----------->s.. 
+ do_or = False + + start = doy - win + if start < 1: + start += 365 + do_or = True + + end = doy + win + if end > 365: + end -= 365 + do_or = True + + if do_or: + I = (sdoy >= start) | (sdoy <= end) + else: + I = (sdoy >= start) & (sdoy <= end) + + # Scipy's beta function doesn's support multi-dimensional arrays, so we have to loop over lat/lon/alt + lat_f = lon_f = alt_f = [None] + dims = ["lat", "lon", "alt"] + if "lat" in source.dims: + lat_f = source["lat"].data + if "lon" in source.dims: + lon_f = source["lon"].data + if "alt" in source.dims: + alt_f = source["alt"].data + + for alt in alt_f: + for lat in lat_f: + for lon in lon_f: + # _log.debug(f'lat, lon, alt = {lat}, {lon}, {alt}) + loc_dict = {k: v for k, v in zip(dims, [lat, lon, alt]) if v is not None} + + data = source.sel(time=I, **loc_dict).dropna("time").data + if np.all(np.isnan(data)): + continue + + # Fit function to the particular point + output.loc[loc_dict][{"time": i}] = self.function(data, output.loc[loc_dict][{"time": i}]) + + # Rescale the outputs + if self.rescale: + output = self.rescale_outputs(output, scale_max, scale_min) + return output + + def function(self, data, output): + raise NotImplementedError( + "Child classes need to implement this function. It is applied over the data and needs" + " to populate the output." + ) + + def rescale_outputs(self, output, scale_max, scale_min): + output = (output * (scale_max - scale_min)) + scale_min + return output diff --git a/podpac/core/algorithm/test/test_algorithm.py b/podpac/core/algorithm/test/test_algorithm.py index 63ec0b093..30c01250c 100644 --- a/podpac/core/algorithm/test/test_algorithm.py +++ b/podpac/core/algorithm/test/test_algorithm.py @@ -9,6 +9,7 @@ import xarray as xr import podpac +from podpac.core.utils import NodeTrait from podpac.core.node import Node, NodeException from podpac.core.data.array_source import Array from podpac.core.algorithm.utility import Arange @@ -23,43 +24,21 @@ def test_eval_not_implemented(self): with pytest.raises(NotImplementedError): node.eval(c) - def test_base_definition(self): - class MyAlgorithm(BaseAlgorithm): - x = tl.Instance(Node) - y = tl.Instance(Node) - z = tl.Unicode().tag(attr=True) - - node = MyAlgorithm(x=Arange(), y=Arange(), z="abcd") - - d = node.base_definition - assert isinstance(d, OrderedDict) - assert "node" in d - assert "attrs" in d - - # base (node, params) - assert d["node"] == "MyAlgorithm" - assert d["attrs"]["z"] == "abcd" - - # inputs - assert "inputs" in d - assert isinstance(d["inputs"], dict) - assert set(d["inputs"].keys()) == set(["x", "y"]) - def test_find_coordinates(self): class MyAlgorithm(BaseAlgorithm): - x = tl.Instance(Node) - y = tl.Instance(Node) + x = NodeTrait().tag(attr=True) + y = NodeTrait().tag(attr=True) node = MyAlgorithm( - x=Array(native_coordinates=podpac.Coordinates([[0, 1, 2], [10, 20]], dims=["lat", "lon"])), - y=Array(native_coordinates=podpac.Coordinates([[0, 1, 2], [110, 120]], dims=["lat", "lon"])), + x=Array(coordinates=podpac.Coordinates([[0, 1, 2], [10, 20]], dims=["lat", "lon"])), + y=Array(coordinates=podpac.Coordinates([[0, 1, 2], [110, 120]], dims=["lat", "lon"])), ) l = node.find_coordinates() assert isinstance(l, list) assert len(l) == 2 - assert node.x.native_coordinates in l - assert node.y.native_coordinates in l + assert node.x.coordinates in l + assert node.y.coordinates in l class TestAlgorithm(object): @@ -217,8 +196,8 @@ def algorithm(self, inputs): def test_multiple_outputs(self): class MyAlgorithm(Algorithm): - x = tl.Instance(Node) - y = 
tl.Instance(Node) + x = NodeTrait().tag(attr=True) + y = NodeTrait().tag(attr=True) outputs = ["sum", "prod", "diff"] def algorithm(self, inputs): @@ -229,7 +208,7 @@ def algorithm(self, inputs): coords = podpac.Coordinates([[0, 1, 2], [10, 20]], dims=["lat", "lon"]) x = Arange() - y = Array(source=np.full(coords.shape, 2), native_coordinates=coords) + y = Array(source=np.full(coords.shape, 2), coordinates=coords) xout = np.arange(6).reshape(3, 2) # all outputs @@ -249,7 +228,7 @@ def algorithm(self, inputs): class TestUnaryAlgorithm(object): - source = Array(native_coordinates=podpac.Coordinates([[0, 1, 2], [10, 20]], dims=["lat", "lon"])) + source = Array(coordinates=podpac.Coordinates([[0, 1, 2], [10, 20]], dims=["lat", "lon"])) def test_outputs(self): node = UnaryAlgorithm(source=self.source) diff --git a/podpac/core/algorithm/test/test_coord_select.py b/podpac/core/algorithm/test/test_coord_select.py index 7e68637a5..3a0b5d6c8 100644 --- a/podpac/core/algorithm/test/test_coord_select.py +++ b/podpac/core/algorithm/test/test_coord_select.py @@ -17,15 +17,14 @@ class MyDataSource(DataSource): - def get_native_coordinates(self): - return podpac.Coordinates( - [ - podpac.crange("2010-01-01", "2018-01-01", "4,h"), - podpac.clinspace(-180, 180, 6), - podpac.clinspace(-80, -70, 6), - ], - dims=["time", "lat", "lon"], - ) + coordinates = podpac.Coordinates( + [ + podpac.crange("2010-01-01", "2018-01-01", "4,h"), + podpac.clinspace(-180, 180, 6), + podpac.clinspace(-80, -70, 6), + ], + dims=["time", "lat", "lon"], + ) def get_data(self, coordinates, slc): node = Arange() @@ -67,7 +66,7 @@ def test_time_expansion_implicit_coordinates(self): o = node.eval(coords) def test_spatial_expansion_ultiple_outputs(self): - multi = Array(source=np.random.random(coords.shape + (2,)), native_coordinates=coords, outputs=["a", "b"]) + multi = Array(source=np.random.random(coords.shape + (2,)), coordinates=coords, outputs=["a", "b"]) node = ExpandCoordinates(source=multi, lat=(-1, 1, 0.1)) o = node.eval(coords) @@ -93,7 +92,7 @@ def test_time_selection_implicit_coordinates(self): o = node.eval(coords) def test_spatial_selection_multiple_outputs(self): - multi = Array(source=np.random.random(coords.shape + (2,)), native_coordinates=coords, outputs=["a", "b"]) + multi = Array(source=np.random.random(coords.shape + (2,)), coordinates=coords, outputs=["a", "b"]) node = SelectCoordinates(source=multi, lat=(46, 56, 1)) o = node.eval(coords) @@ -114,7 +113,7 @@ def test_year_substitution_orig_coords(self): def test_year_substitution_missing_coords(self): source = Array( source=[[1, 2, 3], [4, 5, 6]], - native_coordinates=podpac.Coordinates( + coordinates=podpac.Coordinates( [podpac.crange("2018-01-01", "2018-01-02", "1,D"), podpac.clinspace(45, 66, 3)], dims=["time", "lat"] ), ) @@ -126,7 +125,7 @@ def test_year_substitution_missing_coords(self): def test_year_substitution_missing_coords_orig_coords(self): source = Array( source=[[1, 2, 3], [4, 5, 6]], - native_coordinates=podpac.Coordinates( + coordinates=podpac.Coordinates( [podpac.crange("2018-01-01", "2018-01-02", "1,D"), podpac.clinspace(45, 66, 3)], dims=["time", "lat"] ), ) @@ -136,7 +135,7 @@ def test_year_substitution_missing_coords_orig_coords(self): assert o["time"].data == xr.DataArray(coords.coords["time"]).data def test_year_substitution_multiple_outputs(self): - multi = Array(source=np.random.random(coords.shape + (2,)), native_coordinates=coords, outputs=["a", "b"]) + multi = Array(source=np.random.random(coords.shape + (2,)), coordinates=coords, 
outputs=["a", "b"]) node = YearSubstituteCoordinates(source=multi, year="2018") o = node.eval(coords) assert o.time.dt.year.data[0] == 2018 diff --git a/podpac/core/algorithm/test/test_generic.py b/podpac/core/algorithm/test/test_generic.py index c01e53cbf..e1ef7230e 100644 --- a/podpac/core/algorithm/test/test_generic.py +++ b/podpac/core/algorithm/test/test_generic.py @@ -1,5 +1,6 @@ from __future__ import division, unicode_literals, print_function, absolute_import +import sys import warnings import pytest @@ -10,26 +11,27 @@ from podpac.core.algorithm.utility import Arange, SinCoords from podpac.core.algorithm.generic import GenericInputs, Arithmetic, Generic, Mask, Combine +if sys.version_info.major == 2: + from podpac.core.algorithm.generic import PermissionError + class TestGenericInputs(object): def test_init(self): node = GenericInputs(a=Arange(), b=SinCoords()) - assert "a" in node.inputs - assert "b" in node.inputs + assert node.inputs["a"] == Arange() + assert node.inputs["b"] == SinCoords() + + def test_base_definition(self): + node = GenericInputs(a=Arange(), b=SinCoords()) + d = node._base_definition + assert "inputs" in d + assert "a" in d["inputs"] + assert "b" in d["inputs"] def test_reserved_name(self): with pytest.raises(RuntimeError, match="Trait .* is reserved"): GenericInputs(style=SinCoords()) - def test_serialization(self): - node = GenericInputs(a=Arange(), b=SinCoords()) - d = node.definition - assert d[node.base_ref]["inputs"]["a"] in d - assert d[node.base_ref]["inputs"]["b"] in d - - node2 = node.from_definition(d) - assert node2.hash == node.hash - class TestArithmetic(object): def test_init(self): diff --git a/podpac/core/algorithm/test/test_signal.py b/podpac/core/algorithm/test/test_signal.py index 0781b452b..383620672 100644 --- a/podpac/core/algorithm/test/test_signal.py +++ b/podpac/core/algorithm/test/test_signal.py @@ -67,7 +67,7 @@ def test_eval_multiple_outputs(self): lon = clinspace(-80, 70, 40, name="lon") kernel = [[1, 2, 1]] coords = Coordinates([lat, lon]) - multi = Array(source=np.random.random(coords.shape + (2,)), native_coordinates=coords, outputs=["a", "b"]) + multi = Array(source=np.random.random(coords.shape + (2,)), coordinates=coords, outputs=["a", "b"]) node = Convolution(source=multi, kernel=kernel) o = node.eval(Coordinates([lat, lon])) @@ -78,7 +78,7 @@ def test_eval_nan(self): data = np.ones(coords.shape) data[10, 10] = np.nan - source = Array(source=data, native_coordinates=coords) + source = Array(source=data, coordinates=coords) node = Convolution(source=source, kernel=[[1, 2, 1]]) o = node.eval(coords[8:12, 7:13]) diff --git a/podpac/core/algorithm/test/test_stats.py b/podpac/core/algorithm/test/test_stats.py index c87ceca2b..056b0b869 100644 --- a/podpac/core/algorithm/test/test_stats.py +++ b/podpac/core/algorithm/test/test_stats.py @@ -4,13 +4,16 @@ import numpy as np import xarray as xr import scipy.stats +import traitlets as tl import podpac +from podpac.core.algorithm.utility import Arange from podpac.core.data.array_source import Array from podpac.core.algorithm.stats import Reduce from podpac.core.algorithm.stats import Min, Max, Sum, Count, Mean, Variance, Skew, Kurtosis, StandardDeviation +from podpac.core.algorithm.generic import Arithmetic from podpac.core.algorithm.stats import Median, Percentile -from podpac.core.algorithm.stats import GroupReduce, DayOfYear +from podpac.core.algorithm.stats import GroupReduce, DayOfYear, DayOfYearWindow def setup_module(): @@ -24,11 +27,11 @@ def setup_module(): a[3, 0, 0] = 
np.nan a[0, 3, 0] = np.nan a[0, 0, 3] = np.nan - source = Array(source=a, native_coordinates=coords) + source = Array(source=a, coordinates=coords) data = source.eval(coords) ab = np.stack([a, 2 * a], -1) - multisource = Array(source=ab, native_coordinates=coords, outputs=["a", "b"]) + multisource = Array(source=ab, coordinates=coords, outputs=["a", "b"]) bdata = 2 * data @@ -40,13 +43,13 @@ def test_auto_chunk(self): node = Min(source=source) with podpac.settings: - podpac.settings["CACHE_OUTPUT_DEFAULT"] = False + podpac.settings["CACHE_NODE_OUTPUT_DEFAULT"] = False podpac.settings["CHUNK_SIZE"] = "auto" node.eval(coords) def test_chunked_fallback(self): with podpac.settings: - podpac.settings["CACHE_OUTPUT_DEFAULT"] = False + podpac.settings["CACHE_NODE_OUTPUT_DEFAULT"] = False class First(Reduce): def reduce(self, x): @@ -72,7 +75,7 @@ class BaseTests(object): def test_full(self): with podpac.settings: - podpac.settings["CACHE_OUTPUT_DEFAULT"] = False + podpac.settings["CACHE_NODE_OUTPUT_DEFAULT"] = False podpac.settings["CHUNK_SIZE"] = None node = self.NodeClass(source=source) @@ -88,7 +91,7 @@ def test_full(self): def test_full_chunked(self): with podpac.settings: node = self.NodeClass(source=source, dims=coords.dims) - podpac.settings["CACHE_OUTPUT_DEFAULT"] = False + podpac.settings["CACHE_NODE_OUTPUT_DEFAULT"] = False podpac.settings["CHUNK_SIZE"] = 500 output = node.eval(coords) # xr.testing.assert_allclose(output, self.expected_full) @@ -96,7 +99,7 @@ def test_full_chunked(self): def test_lat_lon(self): with podpac.settings: - podpac.settings["CACHE_OUTPUT_DEFAULT"] = False + podpac.settings["CACHE_NODE_OUTPUT_DEFAULT"] = False podpac.settings["CHUNK_SIZE"] = None node = self.NodeClass(source=source, dims=["lat", "lon"]) output = node.eval(coords) @@ -105,7 +108,7 @@ def test_lat_lon(self): def test_lat_lon_chunked(self): with podpac.settings: - podpac.settings["CACHE_OUTPUT_DEFAULT"] = False + podpac.settings["CACHE_NODE_OUTPUT_DEFAULT"] = False podpac.settings["CHUNK_SIZE"] = 500 node = self.NodeClass(source=source, dims=["lat", "lon"]) output = node.eval(coords) @@ -114,7 +117,7 @@ def test_lat_lon_chunked(self): def test_time(self): with podpac.settings: - podpac.settings["CACHE_OUTPUT_DEFAULT"] = False + podpac.settings["CACHE_NODE_OUTPUT_DEFAULT"] = False podpac.settings["CHUNK_SIZE"] = None node = self.NodeClass(source=source, dims="time") output = node.eval(coords) @@ -123,7 +126,7 @@ def test_time(self): def test_time_chunked(self): with podpac.settings: - podpac.settings["CACHE_OUTPUT_DEFAULT"] = False + podpac.settings["CACHE_NODE_OUTPUT_DEFAULT"] = False podpac.settings["CHUNK_SIZE"] = 500 node = self.NodeClass(source=source, dims="time") output = node.eval(coords) @@ -132,7 +135,7 @@ def test_time_chunked(self): def test_multiple_outputs(self): with podpac.settings: - podpac.settings["CACHE_OUTPUT_DEFAULT"] = False + podpac.settings["CACHE_NODE_OUTPUT_DEFAULT"] = False podpac.settings["CHUNK_SIZE"] = None node = self.NodeClass(source=multisource, dims=["lat", "lon"]) output = node.eval(coords) @@ -265,5 +268,102 @@ class TestGroupReduce(object): pass +class TestResampleReduce(object): + pass + + class TestDayOfYear(object): pass + + +class F(DayOfYearWindow): + cache_output = tl.Bool(False) + cache_update = tl.Bool(True) + + def function(self, data, output): + return len(data) + + +class FM(DayOfYearWindow): + cache_output = tl.Bool(False) + cache_update = tl.Bool(True) + + def function(self, data, output): + return np.mean(data) + + +class 
TestDayOfYearWindow(object): + def test_doy_window1(self): + coords = podpac.coordinates.concat( + [ + podpac.Coordinates([podpac.crange("1999-12-29", "2000-01-02", "1,D", "time")]), + podpac.Coordinates([podpac.crange("2001-12-30", "2002-01-03", "1,D", "time")]), + ] + ) + + node = Arange() + nodedoywindow = F(source=node, window=1, cache_output=False, cache_update=True) + o = nodedoywindow.eval(coords) + + np.testing.assert_array_equal(o, [2, 2, 1, 1, 2, 2]) + + def test_doy_window2(self): + coords = podpac.coordinates.concat( + [ + podpac.Coordinates([podpac.crange("1999-12-29", "2000-01-03", "1,D", "time")]), + podpac.Coordinates([podpac.crange("2001-12-30", "2002-01-02", "1,D", "time")]), + ] + ) + + node = Arange() + nodedoywindow = F(source=node, window=2, cache_output=False, cache_update=True) + o = nodedoywindow.eval(coords) + + np.testing.assert_array_equal(o, [6, 5, 3, 3, 5, 6]) + + def test_doy_window2_mean_rescale_float(self): + coords = podpac.coordinates.concat( + [ + podpac.Coordinates([podpac.crange("1999-12-29", "2000-01-03", "1,D", "time")]), + podpac.Coordinates([podpac.crange("2001-12-30", "2002-01-02", "1,D", "time")]), + ] + ) + + node = Arange() + nodedoywindow = FM(source=node, window=2, cache_output=False, cache_update=True) + o = nodedoywindow.eval(coords) + + nodedoywindow_s = FM( + source=node, window=2, cache_output=False, cache_update=True, scale_float=[0, coords.size], rescale=True + ) + o_s = nodedoywindow_s.eval(coords) + + np.testing.assert_array_almost_equal(o, o_s) + + def test_doy_window2_mean_rescale_max_min(self): + with podpac.settings: + podpac.settings.set_unsafe_eval(True) + + coords = podpac.coordinates.concat( + [ + podpac.Coordinates([podpac.crange("1999-12-29", "2000-01-03", "1,D", "time")]), + podpac.Coordinates([podpac.crange("2001-12-30", "2002-01-02", "1,D", "time")]), + ] + ) + + node = Arange() + node_max = Arithmetic(source=node, eqn="(source < 5) + source") + node_min = Arithmetic(source=node, eqn="-1*(source < 5) + source") + + nodedoywindow_s = FM( + source=node, + window=2, + cache_output=False, + cache_update=True, + scale_max=node_max, + scale_min=node_min, + rescale=False, + ) + o_s = nodedoywindow_s.eval(coords) + + np.testing.assert_array_almost_equal([0.5] * o_s.size, o_s) diff --git a/podpac/core/algorithm/utility.py b/podpac/core/algorithm/utility.py index 82d1eb4c5..e9c7cab19 100644 --- a/podpac/core/algorithm/utility.py +++ b/podpac/core/algorithm/utility.py @@ -63,7 +63,7 @@ def algorithm(self, inputs): raise ValueError("Coordinate name not in evaluated coordinates") c = self._requested_coordinates[self.coord_name] - coords = Coordinates([c]) + coords = Coordinates([c], validate_crs=False) return self.create_output_array(coords, data=c.coordinates) diff --git a/podpac/core/authentication.py b/podpac/core/authentication.py index 8d38d7f4e..a9dd9ec28 100644 --- a/podpac/core/authentication.py +++ b/podpac/core/authentication.py @@ -1,171 +1,194 @@ """ -PODPAC Authentication +PODPAC Authentication """ -from __future__ import division, unicode_literals, print_function, absolute_import - - -import sys import getpass -import re - -# python 2/3 compatibility -if sys.version_info.major < 3: - input = raw_input -else: - from builtins import input - -# Optional PODPAC dependency -try: - import requests -except: - - class Dum(object): - def __init__(self, *args, **kwargs): - pass +import logging - requests = Dum() - requests.Session = Dum +import requests +import traitlets as tl +from lazy_import import lazy_module -from 
podpac.core import utils from podpac.core.settings import settings +from podpac.core.utils import cached_property +_log = logging.getLogger(__name__) -class Session(requests.Session): - """Base Class for authentication in PODPAC - - Attributes - ---------- - auth : tuple - (username, password) string in plain text - hostname : str - Host address (eg. http://example.com) that gets authenticated. - password : str - Password used for authentication. - Loaded from podpac settings file using password@:attr:`hostname` as the key. - username : str - Username used for authentication. - Loaded from podpac settings file using username@:attr:`hostname` as the key. - """ - - def __init__(self, hostname="", username=None, password=None): - # requests __init__ - super(Session, self).__init__() - - self.hostname = hostname - self.username = username - self.password = password - - # load username/password from settings - if self.username is None: - self.username = settings["username@" + self.hostname] - - if self.password is None: - self.password = settings["password@" + self.hostname] - - self.auth = (self.username, self.password) - - -class EarthDataSession(Session): - """ - Modified from: https://wiki.earthdata.nasa.gov/display/EL/How+To+Access+Data+With+Python - overriding requests.Session.rebuild_auth to maintain headers when redirected +def set_credentials(hostname, username=None, password=None): + """Set authentication credentials for a remote URL in the :class:`podpac.settings`. - Attributes + Parameters ---------- - product_url : str - Url to NSIDC product OpenDAP server - product_url_regex : str - Regex used to match redirected hostname if different from :attr:`self.hostname` + hostname : str + Hostname for `username` and `password`. + username : str, optional + Username to store in settings for `hostname`. + If no username is provided and the username does not already exist in the settings, + the user will be prompted to enter one. + password : str, optional + Password to store in settings for `hostname` + If no password is provided and the password does not already exist in the settings, + the user will be prompted to enter one. """ - # make sure attributes are persistent across all EarthDataSession classes - hostname = None - username = None - password = None - auth = tuple() + if hostname is None or hostname == "": + raise ValueError("`hostname` must be defined") - def __init__(self, product_url="", **kwargs): + # see whats stored in settings already + u_settings = settings.get("username@{}".format(hostname)) + p_settings = settings.get("password@{}".format(hostname)) - # override hostname with earthdata url - kwargs["hostname"] = "urs.earthdata.nasa.gov" + # get username from 1. function input 2. settings 3. 
python input() + u = username or u_settings or input("Username: ") + p = password or p_settings or getpass.getpass() - # Session init - super(EarthDataSession, self).__init__(**kwargs) + # set values in settings + settings["username@{}".format(hostname)] = u + settings["password@{}".format(hostname)] = p - # store product_url - self.product_url = product_url + _log.debug("Set credentials for hostname {}".format(hostname)) - # parse product_url for hostname - product_url_hostname = requests.utils.urlparse(self.product_url).hostname - # make all numbers in product_url_hostname wildcards - self.product_url_regex = ( - re.compile(re.sub(r"\d", r"\\d", product_url_hostname)) if product_url_hostname is not None else None - ) +class RequestsSessionMixin(tl.HasTraits): + hostname = tl.Unicode(allow_none=False) + auth_required = tl.Bool(default_value=False) - def rebuild_auth(self, prepared_request, response): + @property + def username(self): + """Returns username stored in settings for accessing `self.hostname`. + The username is stored under key `username@` + + Returns + ------- + str + username stored in settings for accessing `self.hostname` + + Raises + ------ + ValueError + Raises a ValueError if not username is stored in settings for `self.hostname` """ - Overrides from the library to keep headers when redirected to or from - the NASA auth host. + key = "username@{}".format(self.hostname) + username = settings.get(key) + if not username: + raise ValueError( + "No username found for hostname '{0}'. Use `{1}.set_credentials(username='', password='') to store credentials for this host".format( + self.hostname, self.__class__.__name__ + ) + ) + + return username + + @property + def password(self): + """Returns password stored in settings for accessing `self.hostname`. + The password is stored under key `password@` - Parameters - ---------- - prepared_request : requests.Request - Description - response : requests.Response - Description + Returns + ------- + str + password stored in settings for accessing `self.hostname` + + Raises + ------ + ValueError + Raises a ValueError if not password is stored in settings for `self.hostname` + """ + key = "password@{}".format(self.hostname) + password = settings.get(key) + if not password: + raise ValueError( + "No password found for hostname {0}. 
Use `{1}.set_credentials(username='', password='')` to store credentials for this host".format(
+                    self.hostname, self.__class__.__name__
+                )
+            )
+
+        return password
+
+    @cached_property
+    def session(self):
+        """Requests Session object for making calls to remote `self.hostname`
+        See https://2.python-requests.org/en/master/api/#sessionapi

         Returns
         -------
-        None
-
+        :class:requests.Session
+            Requests Session class with `auth` attribute defined
         """
-        headers = prepared_request.headers
-        url = prepared_request.url
-
-        if "Authorization" in headers:
-            original_parsed = requests.utils.urlparse(response.request.url)
-            redirect_parsed = requests.utils.urlparse(url)
-
-            # delete Authorization headers if original and redirect do not match
-            # is not in product_url_regex
-            if (
-                (original_parsed.hostname != redirect_parsed.hostname)
-                and redirect_parsed.hostname != self.hostname
-                and original_parsed.hostname != self.hostname
-            ):
-
-                # if redirect matches product_url_regex, then allow the headers to stay
-                if self.product_url_regex is not None and self.product_url_regex.match(redirect_parsed.hostname):
-                    pass
-                else:
-                    del headers["Authorization"]
-
-        return
-
-    def update_login(self, username=None, password=None):
-        """Summary
+        return self._create_session()
+
+    def set_credentials(self, username=None, password=None):
+        """Shortcut to :func:`podpac.authentication.set_credentials` using class member :attr:`self.hostname` for the hostname

         Parameters
         ----------
         username : str, optional
-            Username input
+            Username to store in settings for `self.hostname`.
+            If no username is provided and the username does not already exist in the settings,
+            the user will be prompted to enter one.
         password : str, optional
-            Password input
+            Password to store in settings for `self.hostname`
+            If no password is provided and the password does not already exist in the settings,
+            the user will be prompted to enter one.
         """
-        print("Updating login information for: ", self.hostname)
-
-        if username is None:
-            username = input("Username: ")
-
-        settings["username@" + self.hostname] = username
-
-        if password is None:
-            password = getpass.getpass()
-
-        settings["password@" + self.hostname] = password
+        return set_credentials(self.hostname, username=username, password=password)

-        self.auth = (username, password)
+    def _create_session(self):
+        """Creates a :class:`requests.Session` with username and password defined
+
+        Returns
+        -------
+        :class:`requests.Session`
+        """
+        s = requests.Session()
+
+        try:
+            s.auth = (self.username, self.password)
+        except ValueError as e:
+            if self.auth_required:
+                raise e
+            else:
+                _log.warning("No auth provided for session")
+
+        return s
+
+
+class S3Mixin(tl.HasTraits):
+    """ Mixin to add S3 credentials and access to a Node.
""" + + anon = tl.Bool(False) + aws_access_key_id = tl.Unicode(allow_none=True) + aws_secret_access_key = tl.Unicode(allow_none=True) + aws_region_name = tl.Unicode(allow_none=True) + aws_client_kwargs = tl.Dict() + config_kwargs = tl.Dict() + + @tl.default("aws_access_key_id") + def _get_access_key_id(self): + return settings["AWS_ACCESS_KEY_ID"] + + @tl.default("aws_secret_access_key") + def _get_secret_access_key(self): + return settings["AWS_SECRET_ACCESS_KEY"] + + @tl.default("aws_region_name") + def _get_region_name(self): + return settings["AWS_REGION_NAME"] + + @cached_property + def s3(self): + # this has to be done here for multithreading to work + s3fs = lazy_module("s3fs") + + if self.anon: + return s3fs.S3FileSystem(anon=True, client_kwargs=self.aws_client_kwargs) + else: + return s3fs.S3FileSystem( + key=self.aws_access_key_id, + secret=self.aws_secret_access_key, + region_name=self.aws_region_name, + client_kwargs=self.aws_client_kwargs, + config_kwargs=self.config_kwargs, + ) diff --git a/podpac/core/cache/cache_ctrl.py b/podpac/core/cache/cache_ctrl.py index 11f872154..30b075e28 100644 --- a/podpac/core/cache/cache_ctrl.py +++ b/podpac/core/cache/cache_ctrl.py @@ -4,13 +4,19 @@ import podpac from podpac.core.settings import settings - from podpac.core.cache.utils import CacheWildCard, CacheException from podpac.core.cache.ram_cache_store import RamCacheStore from podpac.core.cache.disk_cache_store import DiskCacheStore from podpac.core.cache.s3_cache_store import S3CacheStore +_CACHE_STORES = {"ram": RamCacheStore, "disk": DiskCacheStore, "s3": S3CacheStore} + +_CACHE_NAMES = {RamCacheStore: "ram", DiskCacheStore: "disk", S3CacheStore: "s3"} + +_CACHE_MODES = ["ram", "disk", "network", "all"] + + def get_default_cache_ctrl(): """ Get the default CacheCtrl according to the settings. @@ -27,46 +33,39 @@ def get_default_cache_ctrl(): return make_cache_ctrl(settings["DEFAULT_CACHE"]) -def make_cache_ctrl(stores): +def make_cache_ctrl(names): """ Make a cache_ctrl from a list of cache store types. Arguments --------- - stores : str or list - cache store or stores, e.g. 'ram' or ['ram', 'disk']. + names : str or list + cache name or names, e.g. 'ram' or ['ram', 'disk']. Returns ------- ctrl : CacheCtrl - CachCtrl using the specified cache stores + CachCtrl using the specified cache names """ - if isinstance(stores, str): - stores = [stores] + if isinstance(names, six.string_types): + names = [names] - cache_stores = [] - for elem in stores: - if elem == "ram": - cache_stores.append(RamCacheStore()) - elif elem == "disk": - cache_stores.append(DiskCacheStore()) - elif elem == "s3": - cache_stores.append(S3CacheStore()) - else: - raise ValueError("Unknown cache store type '%s'" % elem) + for name in names: + if name not in _CACHE_STORES: + raise ValueError("Unknown cache store type '%s', options are %s" % (name, list(_CACHE_STORES))) - return CacheCtrl(cache_stores) + return CacheCtrl([_CACHE_STORES[name]() for name in names]) -def clear_cache(mode=None): +def clear_cache(mode="all"): """ Clear the entire default cache_ctrl. Arguments --------- mode : str - determines what types of the `CacheStore` are affected: 'ram','disk','network','all'. + determines what types of the `CacheStore` are affected. Options: 'ram', 'disk', 'network', 'all'. Default 'all'. 
""" cache_ctrl = get_default_cache_ctrl() @@ -88,20 +87,22 @@ def __init__(self, cache_stores=[]): Parameters ---------- cache_stores : list, optional - list of CacheStore objects to manage, in the order that they should be interogated. + list of CacheStore objects to manage, in the order that they should be interrogated. """ + self._cache_stores = cache_stores - self._cache_mode = None - def _get_cache_stores(self, mode): - if mode is None: - mode = self._cache_mode - if mode is None: - mode = "all" + def __repr__(self): + return "CacheCtrl(cache_stores=%s)" % self.cache_stores + + @property + def cache_stores(self): + return [_CACHE_NAMES[store.__class__] for store in self._cache_stores] + def _get_cache_stores_by_mode(self, mode="all"): return [c for c in self._cache_stores if mode in c.cache_modes] - def put(self, node, data, key, coordinates=None, mode=None, update=False): + def put(self, node, data, key, coordinates=None, mode="all", update=True): """Cache data for specified node. Parameters @@ -115,30 +116,30 @@ def put(self, node, data, key, coordinates=None, mode=None, update=False): coordinates : Coordinates, optional Coordinates for which cached object should be retrieved, for coordinate-dependent data such as evaluation output mode : str - determines what types of the `CacheStore` are affected: 'ram','disk','network','all'. Defaults to `node._cache_mode` or 'all'. Overriden by `self._cache_mode` if `self._cache_mode` is not `None`. + determines what types of the `CacheStore` are affected. Options: 'ram', 'disk', 'network', 'all'. Default 'all'. update : bool If True existing data in cache will be updated with `data`, If False, error will be thrown if attempting put something into the cache with the same node, key, coordinates of an existing entry. """ if not isinstance(node, podpac.Node): - raise TypeError("node must of type 'Node', not '%s'" % type(Node)) + raise TypeError("Invalid node (must be of type Node, not '%s')" % type(node)) if not isinstance(key, six.string_types): - raise TypeError("key must be a string type, not '%s'" % (type(key))) + raise TypeError("Invalid key (must be a string, not '%s')" % (type(key))) if not isinstance(coordinates, podpac.Coordinates) and coordinates is not None: - raise TypeError("coordinates must be of type 'Coordinates', not '%s'" % type(coordinates)) + raise TypeError("Invalid coordinates (must be of type 'Coordinates', not '%s')" % type(coordinates)) - if not isinstance(mode, six.string_types) and mode is not None: - raise TypeError("mode must be of type 'str', not '%s'" % type(mode)) + if mode not in _CACHE_MODES: + raise ValueError("Invalid mode (must be one of %s, not '%s')" % (_CACHE_MODES, mode)) if key == "*": - raise ValueError("key cannot be '*'") + raise ValueError("Invalid key ('*' is reserved)") - for c in self._get_cache_stores(mode): + for c in self._get_cache_stores_by_mode(mode): c.put(node=node, data=data, key=key, coordinates=coordinates, update=update) - def get(self, node, key, coordinates=None, mode=None): + def get(self, node, key, coordinates=None, mode="all"): """Get cached data for this node. Parameters @@ -150,7 +151,7 @@ def get(self, node, key, coordinates=None, mode=None): coordinates : Coordinates, optional Coordinates for which cached object should be retrieved, for coordinate-dependent data such as evaluation output mode : str - determines what types of the `CacheStore` are affected: 'ram','disk','network','all'. Defaults to `node._cache_mode` or 'all'. 
Overriden by `self._cache_mode` if `self._cache_mode` is not `None`. + determines what types of the `CacheStore` are affected. Options: 'ram', 'disk', 'network', 'all'. Default 'all'. Returns ------- @@ -164,26 +165,26 @@ def get(self, node, key, coordinates=None, mode=None): """ if not isinstance(node, podpac.Node): - raise TypeError("node must of type 'Node', not '%s'" % type(Node)) + raise TypeError("Invalid node (must be of type Node, not '%s')" % type(node)) if not isinstance(key, six.string_types): - raise TypeError("key must be a string type, not '%s'" % (type(key))) + raise TypeError("Invalid key (must be a string, not '%s')" % (type(key))) if not isinstance(coordinates, podpac.Coordinates) and coordinates is not None: - raise TypeError("coordinates must be of type 'Coordinates', not '%s'" % type(coordinates)) + raise TypeError("Invalid coordinates (must be of type 'Coordinates', not '%s')" % type(coordinates)) - if not isinstance(mode, six.string_types) and mode is not None: - raise TypeError("mode must be of type 'str', not '%s'" % type(mode)) + if mode not in _CACHE_MODES: + raise ValueError("Invalid mode (must be one of %s, not '%s')" % (_CACHE_MODES, mode)) if key == "*": - raise ValueError("key cannot be '*'") + raise ValueError("Invalid key ('*' is reserved)") - for c in self._get_cache_stores(mode): + for c in self._get_cache_stores_by_mode(mode): if c.has(node=node, key=key, coordinates=coordinates): return c.get(node=node, key=key, coordinates=coordinates) raise CacheException("Requested data is not in any cache stores.") - def has(self, node, key, coordinates=None, mode=None): + def has(self, node, key, coordinates=None, mode="all"): """Check for cached data for this node Parameters @@ -195,7 +196,7 @@ def has(self, node, key, coordinates=None, mode=None): coordinates: Coordinate, optional Coordinates for which cached object should be checked mode : str - determines what types of the `CacheStore` are affected: 'ram','disk','network','all'. Defaults to `node._cache_mode` or 'all'. Overriden by `self._cache_mode` if `self._cache_mode` is not `None`. + determines what types of the `CacheStore` are affected. Options: 'ram', 'disk', 'network', 'all'. Default 'all'. 
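
The `put`/`get`/`has` hunks above switch the defaults to `mode="all"` and `update=True` and tighten the argument checks. A short sketch of the resulting `CacheCtrl` API, assuming the store constructors shown elsewhere in this patch:

```python
# Sketch of the revised CacheCtrl behavior (mirrors the new tests further down).
import podpac
from podpac.core.cache.cache_ctrl import CacheCtrl
from podpac.core.cache.ram_cache_store import RamCacheStore
from podpac.core.cache.disk_cache_store import DiskCacheStore

node = podpac.algorithm.Arange()
ctrl = CacheCtrl(cache_stores=[RamCacheStore(), DiskCacheStore()])
print(ctrl.cache_stores)            # ['ram', 'disk'] -- new property used by __repr__

ctrl.put(node, 10, "key")           # mode defaults to "all", update defaults to True
assert ctrl.has(node, "key")
assert ctrl.get(node, "key") == 10

ctrl.put(node, 20, "key")           # overwrites silently under the new update=True default
assert ctrl.get(node, "key") == 20

ctrl.put(node, 30, "key", mode="ram")   # only stores whose cache_modes include 'ram'
ctrl.clear()
```
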
Returns ------- @@ -204,27 +205,27 @@ def has(self, node, key, coordinates=None, mode=None): """ if not isinstance(node, podpac.Node): - raise TypeError("node must of type 'Node', not '%s'" % type(Node)) + raise TypeError("Invalid node (must be of type Node, not '%s')" % type(node)) if not isinstance(key, six.string_types): - raise TypeError("key must be a string type, not '%s'" % (type(key))) + raise TypeError("Invalid key (must be a string, not '%s')" % (type(key))) if not isinstance(coordinates, podpac.Coordinates) and coordinates is not None: - raise TypeError("coordinates must be of type 'Coordinates', not '%s'" % type(coordinates)) + raise TypeError("Invalid coordinates (must be of type 'Coordinates', not '%s')" % type(coordinates)) - if not isinstance(mode, six.string_types) and mode is not None: - raise TypeError("mode must be of type 'str', not '%s'" % type(mode)) + if mode not in _CACHE_MODES: + raise ValueError("Invalid mode (must be one of %s, not '%s')" % (_CACHE_MODES, mode)) if key == "*": - raise ValueError("key cannot be '*'") + raise ValueError("Invalid key ('*' is reserved)") - for c in self._get_cache_stores(mode): + for c in self._get_cache_stores_by_mode(mode): if c.has(node=node, key=key, coordinates=coordinates): return True return False - def rem(self, node, key, coordinates=None, mode=None): + def rem(self, node, key, coordinates=None, mode="all"): """Delete cached data for this node. Parameters @@ -236,20 +237,20 @@ def rem(self, node, key, coordinates=None, mode=None): coordinates : Coordinates, str Delete only cached objects for these coordinates. Use `'*'` to match all coordinates. mode : str - determines what types of the `CacheStore` are affected: 'ram','disk','network','all'. Defaults to `node._cache_mode` or 'all'. Overriden by `self._cache_mode` if `self._cache_mode` is not `None`. + determines what types of the `CacheStore` are affected. Options: 'ram', 'disk', 'network', 'all'. Default 'all'. """ if not isinstance(node, podpac.Node): - raise TypeError("node must of type 'Node', not '%s'" % type(podpac.Node)) + raise TypeError("Invalid node (must be of type Node, not '%s')" % type(node)) if not isinstance(key, six.string_types): - raise TypeError("key must be a string type, not '%s'" % (type(key))) + raise TypeError("Invalid key (must be a string, not '%s')" % (type(key))) if not isinstance(coordinates, podpac.Coordinates) and coordinates is not None and coordinates != "*": - raise TypeError("coordinates must be '*' or of type 'Coordinates' not '%s'" % type(coordinates)) + raise TypeError("Invalid coordinates (must be '*' or of type 'Coordinates', not '%s')" % type(coordinates)) - if not isinstance(mode, six.string_types) and mode is not None: - raise TypeError("mode must be of type 'str', not '%s'" % type(mode)) + if mode not in _CACHE_MODES: + raise ValueError("Invalid mode (must be one of %s, not '%s')" % (_CACHE_MODES, mode)) if key == "*": key = CacheWildCard() @@ -257,21 +258,21 @@ def rem(self, node, key, coordinates=None, mode=None): if coordinates == "*": coordinates = CacheWildCard() - for c in self._get_cache_stores(mode): + for c in self._get_cache_stores_by_mode(mode): c.rem(node=node, key=key, coordinates=coordinates) - def clear(self, mode=None): + def clear(self, mode="all"): """ Clear all cached data. Parameters ------------ mode : str - determines what types of the `CacheStore` are affected: 'ram','disk','network','all'. Defaults to `node._cache_mode` or 'all'. Overriden by `self._cache_mode` if `self._cache_mode` is not `None`. 
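
For `rem`, the reserved `'*'` key is not rejected as it is for `put`/`get`/`has`; it is expanded to a `CacheWildCard`, and the same applies to `coordinates="*"`. A hedged sketch of the wildcard behavior (it mirrors `test_rem_wildcard_key` in the updated tests below):

```python
# Sketch only: wildcard removal with the revised CacheCtrl.rem.
import podpac
from podpac.core.cache.cache_ctrl import CacheCtrl
from podpac.core.cache.ram_cache_store import RamCacheStore

ctrl = CacheCtrl(cache_stores=[RamCacheStore()])
node = podpac.algorithm.Arange()

ctrl.put(node, 10, "key_a")
ctrl.put(node, 20, "key_b")

ctrl.rem(node, key="*")              # '*' becomes a CacheWildCard: removes every key for this node
assert not ctrl.has(node, "key_a")
assert not ctrl.has(node, "key_b")

ctrl.rem(node, "key_a", coordinates="*")   # likewise matches entries for any coordinates
```
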
+ determines what types of the `CacheStore` are affected. Options: 'ram', 'disk', 'network', 'all'. Default 'all'. """ - if not isinstance(mode, six.string_types) and mode is not None: - raise TypeError("mode must be of type 'str', not '%s'" % type(mode)) + if mode not in _CACHE_MODES: + raise ValueError("Invalid mode (must be one of %s, not '%s')" % (_CACHE_MODES, mode)) - for c in self._get_cache_stores(mode): + for c in self._get_cache_stores_by_mode(mode): c.clear() diff --git a/podpac/core/cache/cache_store.py b/podpac/core/cache/cache_store.py index 8d23fe752..d5f35405b 100644 --- a/podpac/core/cache/cache_store.py +++ b/podpac/core/cache/cache_store.py @@ -29,7 +29,7 @@ def size(self): raise NotImplementedError - def put(self, node, data, key, coordinates=None, update=False): + def put(self, node, data, key, coordinates=None, update=True): """Cache data for specified node. Parameters diff --git a/podpac/core/cache/disk_cache_store.py b/podpac/core/cache/disk_cache_store.py index 78bce95da..fce0e7ecb 100644 --- a/podpac/core/cache/disk_cache_store.py +++ b/podpac/core/cache/disk_cache_store.py @@ -23,10 +23,7 @@ def __init__(self): if not settings["DISK_CACHE_ENABLED"]: raise CacheException("Disk cache is disabled in the podpac settings.") - if os.path.isabs(settings["DISK_CACHE_DIR"]): - self._root_dir_path = settings["DISK_CACHE_DIR"] - else: - self._root_dir_path = os.path.join(settings["ROOT_PATH"], settings["DISK_CACHE_DIR"]) + self._root_dir_path = settings.cache_path # ----------------------------------------------------------------------------------------------------------------- # public cache API diff --git a/podpac/core/cache/file_cache_store.py b/podpac/core/cache/file_cache_store.py index 4eb55234c..680a871b4 100644 --- a/podpac/core/cache/file_cache_store.py +++ b/podpac/core/cache/file_cache_store.py @@ -36,7 +36,7 @@ class FileCacheStore(CacheStore): # public cache API methods # ----------------------------------------------------------------------------------------------------------------- - def put(self, node, data, key, coordinates=None, update=False): + def put(self, node, data, key, coordinates=None, update=True): """Cache data for specified node. Parameters @@ -54,10 +54,10 @@ def put(self, node, data, key, coordinates=None, update=False): """ # check for existing entry - if self.has(node, key, coordinates): - if not update: - raise CacheException("Cache entry already exists. Use `update=True` to overwrite.") - self.rem(node, key, coordinates) + if not update and self.has(node, key, coordinates): + raise CacheException("Cache entry already exists. Use `update=True` to overwrite.") + + self.rem(node, key, coordinates) # serialize path_root = self._path_join(self._get_node_dir(node), self._get_filename(node, key, coordinates)) @@ -88,7 +88,7 @@ def put(self, node, data, key, coordinates=None, update=False): else: warnings.warn( "Object of type '%s' is not json serializable; caching object to file using pickle, which " - "may not be compatible with other Python versions or podpac versions." + "may not be compatible with other Python versions or podpac versions." 
% type(data) ) path = path_root + ".pkl" s = pickle.dumps(data) diff --git a/podpac/core/cache/ram_cache_store.py b/podpac/core/cache/ram_cache_store.py index 9c884f954..85fbfae4a 100644 --- a/podpac/core/cache/ram_cache_store.py +++ b/podpac/core/cache/ram_cache_store.py @@ -55,7 +55,7 @@ def size(self): process = psutil.Process(os.getpid()) return process.memory_info().rss # this is actually the total size of the process - def put(self, node, data, key, coordinates=None, update=False): + def put(self, node, data, key, coordinates=None, update=True): """Cache data for specified node. Parameters @@ -77,9 +77,10 @@ def put(self, node, data, key, coordinates=None, update=False): full_key = self._get_full_key(node, key, coordinates) - if full_key in _thread_local.cache: - if not update: - raise CacheException("Cache entry already exists. Use update=True to overwrite.") + if not update and full_key in _thread_local.cache: + raise CacheException("Cache entry already exists. Use update=True to overwrite.") + + self.rem(node, key, coordinates) if self.max_size is not None and self.size >= self.max_size: # # TODO removal policy diff --git a/podpac/core/cache/test/test_cache_ctrl.py b/podpac/core/cache/test/test_cache_ctrl.py index 17f8abe89..048fb1cd6 100644 --- a/podpac/core/cache/test/test_cache_ctrl.py +++ b/podpac/core/cache/test/test_cache_ctrl.py @@ -9,18 +9,245 @@ from podpac.core.cache.cache_ctrl import get_default_cache_ctrl, make_cache_ctrl, clear_cache +class CacheCtrlTestNode(podpac.Node): + pass + + +NODE = CacheCtrlTestNode() + + class TestCacheCtrl(object): - def test_init(self): + def test_init_default(self): + ctrl = CacheCtrl() + assert len(ctrl._cache_stores) == 0 + assert ctrl.cache_stores == [] + repr(ctrl) + + def test_init_list(self): + ctrl = CacheCtrl(cache_stores=[]) + assert len(ctrl._cache_stores) == 0 + assert ctrl.cache_stores == [] + repr(ctrl) + + ctrl = CacheCtrl(cache_stores=[RamCacheStore()]) + assert len(ctrl._cache_stores) == 1 + assert isinstance(ctrl._cache_stores[0], RamCacheStore) + assert ctrl.cache_stores == ["ram"] + repr(ctrl) + + ctrl = CacheCtrl(cache_stores=[RamCacheStore(), DiskCacheStore()]) + assert len(ctrl._cache_stores) == 2 + assert isinstance(ctrl._cache_stores[0], RamCacheStore) + assert isinstance(ctrl._cache_stores[1], DiskCacheStore) + assert ctrl.cache_stores == ["ram", "disk"] + repr(ctrl) + + def test_put_has_get(self): + ctrl = CacheCtrl(cache_stores=[RamCacheStore(), DiskCacheStore()]) + ctrl.clear() + + # has False + assert not ctrl._cache_stores[0].has(NODE, "key") + assert not ctrl._cache_stores[1].has(NODE, "key") + assert not ctrl.has(NODE, "key") + + # put + ctrl.put(NODE, 10, "key") + + # has True + assert ctrl._cache_stores[0].has(NODE, "key") + assert ctrl._cache_stores[1].has(NODE, "key") + assert ctrl.has(NODE, "key") + + # get value + assert ctrl._cache_stores[0].get(NODE, "key") == 10 + assert ctrl._cache_stores[1].get(NODE, "key") == 10 + assert ctrl.get(NODE, "key") == 10 + + def test_partial_has_get(self): + ctrl = CacheCtrl(cache_stores=[RamCacheStore(), DiskCacheStore()]) + ctrl.clear() + + # has False + assert not ctrl._cache_stores[0].has(NODE, "key") + assert not ctrl._cache_stores[1].has(NODE, "key") + assert not ctrl.has(NODE, "key") + + # put only in disk + ctrl._cache_stores[1].put(NODE, 10, "key") + + # has + assert not ctrl._cache_stores[0].has(NODE, "key") + assert ctrl._cache_stores[1].has(NODE, "key") + assert ctrl.has(NODE, "key") + + # get + with pytest.raises(CacheException, match="Cache miss"): + 
ctrl._cache_stores[0].get(NODE, "key") + assert ctrl._cache_stores[1].get(NODE, "key") == 10 + assert ctrl.get(NODE, "key") == 10 + + def test_get_cache_miss(self): + ctrl = CacheCtrl(cache_stores=[RamCacheStore(), DiskCacheStore()]) + ctrl.clear() + + with pytest.raises(CacheException, match="Requested data is not in any cache stores"): + ctrl.get(NODE, "key") + + def test_put_rem(self): + ctrl = CacheCtrl(cache_stores=[RamCacheStore(), DiskCacheStore()]) + + # put and check has + ctrl.put(NODE, 10, "key") + assert ctrl.has(NODE, "key") + + # rem other and check has + ctrl.rem(NODE, "other") + assert ctrl.has(NODE, "key") + + # rem and check has + ctrl.rem(NODE, "key") + assert not ctrl.has(NODE, "key") + + def test_rem_wildcard_key(self): + ctrl = CacheCtrl(cache_stores=[RamCacheStore(), DiskCacheStore()]) + + # put and check has + ctrl.put(NODE, 10, "key") + assert ctrl.has(NODE, "key") + + # rem other and check has + ctrl.rem(NODE, key="*") + assert not ctrl.has(NODE, "key") + + def test_rem_wildcard_coordinates(self): pass + def test_put_clear(self): + ctrl = CacheCtrl(cache_stores=[RamCacheStore(), DiskCacheStore()]) -def test_get_default_cache_ctrl(): - ctrl = get_default_cache_ctrl() + # put and check has + ctrl.put(NODE, 10, "key") + assert ctrl.has(NODE, "key") + + # clear and check has + ctrl.clear() + + # check has + assert not ctrl.has(NODE, "key") + + def test_put_has_mode(self): + ctrl = CacheCtrl(cache_stores=[RamCacheStore(), DiskCacheStore()]) + + # put disk and check has + ctrl.clear() + assert not ctrl.has(NODE, "key") + + ctrl.put(NODE, 10, "key", mode="disk") + assert not ctrl._cache_stores[0].has(NODE, "key") + assert not ctrl.has(NODE, "key", mode="ram") + assert ctrl._cache_stores[1].has(NODE, "key") + assert ctrl.has(NODE, "key", mode="disk") + assert ctrl.has(NODE, "key") + + # put ram and check has + ctrl.clear() + assert not ctrl.has(NODE, "key") + + ctrl.put(NODE, 10, "key", mode="ram") + assert ctrl._cache_stores[0].has(NODE, "key") + assert ctrl.has(NODE, "key", mode="ram") + assert not ctrl._cache_stores[1].has(NODE, "key") + assert not ctrl.has(NODE, "key", mode="disk") + assert ctrl.has(NODE, "key") + + def test_invalid_node(self): + ctrl = CacheCtrl(cache_stores=[RamCacheStore(), DiskCacheStore()]) - assert isinstance(ctrl, CacheCtrl) - assert ctrl._cache_stores == [] + # type + with pytest.raises(TypeError, match="Invalid node"): + ctrl.put("node", 10, "key") + with pytest.raises(TypeError, match="Invalid node"): + ctrl.get("node", "key") + + with pytest.raises(TypeError, match="Invalid node"): + ctrl.has("node", "key") + + with pytest.raises(TypeError, match="Invalid node"): + ctrl.rem("node", "key") + + def test_invalid_key(self): + ctrl = CacheCtrl(cache_stores=[RamCacheStore(), DiskCacheStore()]) + + # type + with pytest.raises(TypeError, match="Invalid key"): + ctrl.put(NODE, 10, 10) + + with pytest.raises(TypeError, match="Invalid key"): + ctrl.get(NODE, 10) + + with pytest.raises(TypeError, match="Invalid key"): + ctrl.has(NODE, 10) + + with pytest.raises(TypeError, match="Invalid key"): + ctrl.rem(NODE, 10) + + # wildcard + with pytest.raises(ValueError, match="Invalid key"): + ctrl.put(NODE, 10, "*") + + with pytest.raises(ValueError, match="Invalid key"): + ctrl.get(NODE, "*") + + with pytest.raises(ValueError, match="Invalid key"): + ctrl.has(NODE, "*") + + # allowed + ctrl.rem(NODE, "*") + + def test_invalid_coordinates(self): + ctrl = CacheCtrl(cache_stores=[RamCacheStore(), DiskCacheStore()]) + + # type + with pytest.raises(TypeError, 
match="Invalid coordinates"): + ctrl.put(NODE, 10, "key", coordinates="coords") + + with pytest.raises(TypeError, match="Invalid coordinates"): + ctrl.get(NODE, "key", coordinates="coords") + + with pytest.raises(TypeError, match="Invalid coordinates"): + ctrl.has(NODE, "key", coordinates="coords") + + with pytest.raises(TypeError, match="Invalid coordinates"): + ctrl.rem(NODE, "key", coordinates="coords") + + def test_invalid_mode(self): + ctrl = CacheCtrl(cache_stores=[RamCacheStore(), DiskCacheStore()]) + + with pytest.raises(ValueError, match="Invalid mode"): + ctrl.put(NODE, 10, "key", mode="other") + + with pytest.raises(ValueError, match="Invalid mode"): + ctrl.get(NODE, "key", mode="other") + + with pytest.raises(ValueError, match="Invalid mode"): + ctrl.has(NODE, "key", mode="other") + + with pytest.raises(ValueError, match="Invalid mode"): + ctrl.rem(NODE, "key", mode="other") + + with pytest.raises(ValueError, match="Invalid mode"): + ctrl.clear(mode="other") + + +def test_get_default_cache_ctrl(): with podpac.settings: + podpac.settings["DEFAULT_CACHE"] = [] + ctrl = get_default_cache_ctrl() + assert isinstance(ctrl, CacheCtrl) + assert ctrl._cache_stores == [] + podpac.settings["DEFAULT_CACHE"] = ["ram"] ctrl = get_default_cache_ctrl() assert isinstance(ctrl, CacheCtrl) @@ -28,35 +255,47 @@ def test_get_default_cache_ctrl(): assert isinstance(ctrl._cache_stores[0], RamCacheStore) -def test_make_cache_ctrl(): - ctrl = make_cache_ctrl("ram") - assert isinstance(ctrl, CacheCtrl) - assert len(ctrl._cache_stores) == 1 - assert isinstance(ctrl._cache_stores[0], RamCacheStore) +class TestMakeCacheCtrl(object): + def test_str(self): + ctrl = make_cache_ctrl("ram") + assert isinstance(ctrl, CacheCtrl) + assert len(ctrl._cache_stores) == 1 + assert isinstance(ctrl._cache_stores[0], RamCacheStore) - ctrl = make_cache_ctrl("disk") - assert len(ctrl._cache_stores) == 1 - assert isinstance(ctrl._cache_stores[0], DiskCacheStore) + ctrl = make_cache_ctrl("disk") + assert len(ctrl._cache_stores) == 1 + assert isinstance(ctrl._cache_stores[0], DiskCacheStore) - ctrl = make_cache_ctrl(["ram", "disk"]) - assert len(ctrl._cache_stores) == 2 - assert isinstance(ctrl._cache_stores[0], RamCacheStore) - assert isinstance(ctrl._cache_stores[1], DiskCacheStore) + def test_list(self): + ctrl = make_cache_ctrl(["ram", "disk"]) + assert len(ctrl._cache_stores) == 2 + assert isinstance(ctrl._cache_stores[0], RamCacheStore) + assert isinstance(ctrl._cache_stores[1], DiskCacheStore) - with pytest.raises(ValueError, match="Unknown cache store type"): - ctrl = make_cache_ctrl("other") + ctrl = make_cache_ctrl(["ram", "disk"]) + assert len(ctrl._cache_stores) == 2 + assert isinstance(ctrl._cache_stores[0], RamCacheStore) + assert isinstance(ctrl._cache_stores[1], DiskCacheStore) + def test_invalid(self): + with pytest.raises(ValueError, match="Unknown cache store type"): + ctrl = make_cache_ctrl("other") -def test_clear_cache(): - with podpac.settings: - # make a default cache - podpac.settings["DEFAULT_CACHE"] = ["ram"] + with pytest.raises(ValueError, match="Unknown cache store type"): + ctrl = make_cache_ctrl(["other"]) + + +class TestClearCache(object): + def test_clear_cache(self): + with podpac.settings: + # make a default cache + podpac.settings["DEFAULT_CACHE"] = ["ram"] - # fill the default cache - node = podpac.algorithm.Arange() - node.put_cache(0, "mykey") - assert node.has_cache("mykey") + # fill the default cache + node = podpac.algorithm.Arange() + node.put_cache(0, "mykey") + assert 
node.has_cache("mykey") - clear_cache() + clear_cache() - assert not node.has_cache("mykey") + assert not node.has_cache("mykey") diff --git a/podpac/core/cache/test/test_cache_stores.py b/podpac/core/cache/test/test_cache_stores.py index 5b0b0d13e..cbd1c0dd7 100644 --- a/podpac/core/cache/test/test_cache_stores.py +++ b/podpac/core/cache/test/test_cache_stores.py @@ -1,13 +1,13 @@ -import numpy as np import os import shutil import copy +import tempfile import pytest import xarray as xr +import numpy as np import podpac - from podpac.core.cache.utils import CacheException from podpac.core.cache.ram_cache_store import RamCacheStore from podpac.core.cache.disk_cache_store import DiskCacheStore @@ -15,7 +15,7 @@ COORDS1 = podpac.Coordinates([[0, 1, 2], [10, 20, 30, 40], ["2018-01-01", "2018-01-02"]], dims=["lat", "lon", "time"]) COORDS2 = podpac.Coordinates([[0, 1, 2], [10, 20, 30]], dims=["lat", "lon"]) -NODE1 = podpac.data.Array(source=np.ones(COORDS1.shape), native_coordinates=COORDS1) +NODE1 = podpac.data.Array(source=np.ones(COORDS1.shape), coordinates=COORDS1) NODE2 = podpac.algorithm.Arange() @@ -87,11 +87,11 @@ def test_update(self): # raise exception and do not change with pytest.raises(CacheException, match="Cache entry already exists."): - store.put(NODE1, 10, "mykey1") + store.put(NODE1, 10, "mykey1", update=False) assert store.get(NODE1, "mykey1") == 10 # update - store.put(NODE1, 20, "mykey1", update=True) + store.put(NODE1, 20, "mykey1") assert store.get(NODE1, "mykey1") == 20 def test_get_put_none(self): @@ -314,23 +314,22 @@ class TestDiskCacheStore(FileCacheStoreTests): Store = DiskCacheStore enabled_setting = "DISK_CACHE_ENABLED" limit_setting = "DISK_CACHE_MAX_BYTES" - cache_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "tmp_cache")) def setup_method(self): super(TestDiskCacheStore, self).setup_method() - podpac.settings["DISK_CACHE_DIR"] = self.cache_dir - assert not os.path.exists(self.cache_dir) + self.test_cache_dir = tempfile.mkdtemp(prefix="podpac-test-") + podpac.settings["DISK_CACHE_DIR"] = self.test_cache_dir def teardown_method(self): super(TestDiskCacheStore, self).teardown_method() - shutil.rmtree(self.cache_dir, ignore_errors=True) + shutil.rmtree(self.test_cache_dir, ignore_errors=True) def test_cache_dir(self): # absolute path - podpac.settings["DISK_CACHE_DIR"] = self.cache_dir - expected = self.cache_dir + podpac.settings["DISK_CACHE_DIR"] = self.test_cache_dir + expected = self.test_cache_dir store = DiskCacheStore() store.put(NODE1, 10, "mykey1") assert store.find(NODE1, "mykey1").startswith(expected) @@ -338,7 +337,9 @@ def test_cache_dir(self): # relative path podpac.settings["DISK_CACHE_DIR"] = "_testcache_" - expected = os.path.join(podpac.settings["ROOT_PATH"], "_testcache_") + expected = os.path.join( + os.environ.get("XDG_CACHE_HOME", os.path.join(os.path.expanduser("~"), ".config", "podpac")), "_testcache_" + ) store = DiskCacheStore() store.clear() store.put(NODE1, 10, "mykey1") @@ -363,17 +364,17 @@ class TestS3CacheStore(FileCacheStoreTests): Store = S3CacheStore enabled_setting = "S3_CACHE_ENABLED" limit_setting = "S3_CACHE_MAX_BYTES" - cache_dir = "tmp_cache" + test_cache_dir = "tmp_cache" def setup_method(self): super(TestS3CacheStore, self).setup_method() - podpac.settings["S3_CACHE_DIR"] = self.cache_dir + podpac.settings["S3_CACHE_DIR"] = self.test_cache_dir def teardown_method(self): try: store = S3CacheStore() - store._rmtree(self.cache_dir) + store._rmtree(self.test_cache_dir) except: pass diff --git 
a/podpac/core/cache/test/tmp_cache/podpac/core/node/Test/Test--podpac-cached-property-blue_242a08f6c2e12f7113898cea3ab5003f_64b03983688fcfb5e0e653280ea0b679_None.json b/podpac/core/cache/test/tmp_cache/podpac/core/node/Test/Test--podpac-cached-property-blue_242a08f6c2e12f7113898cea3ab5003f_64b03983688fcfb5e0e653280ea0b679_None.json new file mode 100644 index 000000000..e68122571 --- /dev/null +++ b/podpac/core/cache/test/tmp_cache/podpac/core/node/Test/Test--podpac-cached-property-blue_242a08f6c2e12f7113898cea3ab5003f_64b03983688fcfb5e0e653280ea0b679_None.json @@ -0,0 +1 @@ +"blue" \ No newline at end of file diff --git a/podpac/core/cache/test/tmp_cache/wcs_temp.tiff b/podpac/core/cache/test/tmp_cache/wcs_temp.tiff new file mode 100644 index 000000000..9396ba2d1 Binary files /dev/null and b/podpac/core/cache/test/tmp_cache/wcs_temp.tiff differ diff --git a/podpac/core/compositor.py b/podpac/core/compositor.py deleted file mode 100644 index 0cc51c971..000000000 --- a/podpac/core/compositor.py +++ /dev/null @@ -1,388 +0,0 @@ -""" -Compositor Summary -""" - - -from __future__ import division, unicode_literals, print_function, absolute_import - -import copy - -import numpy as np -import traitlets as tl - -# Internal imports -from podpac.core.settings import settings -from podpac.core.coordinates import Coordinates, merge_dims -from podpac.core.utils import common_doc, ArrayTrait, trait_is_defined -from podpac.core.units import UnitsDataArray -from podpac.core.node import COMMON_NODE_DOC, node_eval, Node -from podpac.core.data.datasource import COMMON_DATA_DOC -from podpac.core.data.interpolation import interpolation_trait -from podpac.core.managers.multi_threading import thread_manager - -COMMON_COMPOSITOR_DOC = COMMON_DATA_DOC.copy() # superset of COMMON_NODE_DOC - - -@common_doc(COMMON_COMPOSITOR_DOC) -class Compositor(Node): - """Compositor - - Attributes - ---------- - cache_native_coordinates : Bool - Default is True. If native_coordinates are requested by the user, it may take a long time to calculate if the - Compositor points to many sources. The result is relatively small and is cached by default. Caching may not be - desired if the datasource change or is updated. - interpolation : str, dict, optional - {interpolation} - is_source_coordinates_complete : Bool - Default is False. The source_coordinates do not have to completely describe the source. For example, the source - coordinates could include the year-month-day of the source, but the actual source also has hour-minute-second - information. In that case, source_coordinates is incomplete. This flag is used to automatically construct - native_coordinates. - shared_coordinates : :class:`podpac.Coordinates`, optional - Coordinates that are shared amongst all of the composited sources - source : str - The source is used for a unique name to cache composited products. - source_coordinates : :class:`podpac.Coordinates` - Coordinates that make each source unique. Much be single-dimensional the same size as ``sources``. Optional. - sources : :class:`np.ndarray` - An array of sources. This is a numpy array as opposed to a list so that boolean indexing may be used to - subselect the nodes that will be evaluated. - source_coordinates : :class:`podpac.Coordinates`, optional - Coordinates that make each source unique. This is used for subsetting which sources to evaluate based on the - user-requested coordinates. It is an optimization. - strict_source_outputs : bool - Default is False. 
When compositing multi-output sources, combine the outputs from all sources. If True, do not - allow sources with different outputs (an exception will be raised if the sources contain different outputs). - - Notes - ----- - Developers of new Compositor nodes need to implement the `composite` method. - - Multitheading:: - * When MULTITHREADING is False, the compositor stops evaluated sources once the output is completely filled. - * When MULTITHREADING is True, the compositor must evaluate every source. - The result is the same, but note that because of this, disabling multithreading could sometimes be faster, - especially if the number of threads is low. - * NASA data servers seem to have a hard limit of 10 simultaneous requests, so a max of 10 threads is recommend - for most use-cases. - """ - - shared_coordinates = tl.Instance(Coordinates, allow_none=True) - source_coordinates = tl.Instance(Coordinates, allow_none=True) - is_source_coordinates_complete = tl.Bool( - False, - help=( - "This allows some optimizations but assumes that a node's " - "native_coordinates=source_coordinate + shared_coordinate " - "IN THAT ORDER" - ), - ) - - sources = ArrayTrait(ndim=1) - cache_native_coordinates = tl.Bool(True) - interpolation = interpolation_trait(default_value=None) - strict_source_outputs = tl.Bool(False) - - @tl.default("source_coordinates") - def _source_coordinates_default(self): - return self.get_source_coordinates() - - @tl.validate("sources") - def _validate_sources(self, d): - self.outputs # check for consistent outputs - return np.array([copy.deepcopy(source) for source in d["value"]]) - - @tl.default("outputs") - def _default_outputs(self): - if all(source.outputs is None for source in self.sources): - return None - - elif all(source.outputs is not None and source.output is None for source in self.sources): - if self.strict_source_outputs: - outputs = self.sources[0].outputs - if any(source.outputs != outputs for source in self.sources): - raise ValueError( - "Source outputs mismatch, and strict_source_outputs is True. " - "The sources must all contain the same outputs if strict_source_outputs is True. " - ) - return outputs - else: - outputs = [] - for source in self.sources: - for output in source.outputs: - if output not in outputs: - outputs.append(output) - if len(outputs) == 0: - outputs = None - return outputs - - else: - raise ValueError( - "Cannot composite standard sources with multi-output sources. " - "The sources must all be stardard single-output nodes or all multi-output nodes." - ) - - @tl.validate("source_coordinates") - def _validate_source_coordinates(self, d): - if d["value"] is not None: - if d["value"].ndim != 1: - raise ValueError("Invalid source_coordinates, invalid ndim (%d != 1)" % d["value"].ndim) - - if d["value"].size != self.sources.size: - raise ValueError( - "Invalid source_coordinates, source and source_coordinates size mismatch (%d != %d)" - % (d["value"].size, self.sources.size) - ) - - return d["value"] - - # default representation - def __repr__(self): - source_name = str(self.__class__.__name__) - - rep = "{}".format(source_name) - rep += "\n\tsource: {}".format("_".join(str(source) for source in self.sources[:3])) - rep += "\n\tinterpolation: {}".format(self.interpolation) - - return rep - - def get_source_coordinates(self): - """ - Returns the coordinates describing each source. - This may be implemented by derived classes, and is an optimization that allows evaluation subsets of source. 
- - Returns - ------- - :class:`podpac.Coordinates` - Coordinates describing each source. - """ - return None - - @tl.default("shared_coordinates") - def _shared_coordinates_default(self): - return self.get_shared_coordinates() - - def get_shared_coordinates(self): - """Coordinates shared by each source. - - Raises - ------ - NotImplementedError - Description - """ - raise NotImplementedError() - - def select_sources(self, coordinates): - """Downselect compositor sources based on requested coordinates. - - This is used during the :meth:`eval` process as an optimization - when :attr:`source_coordinates` are not pre-defined. - - Parameters - ---------- - coordinates : :class:`podpac.Coordinates` - Coordinates to evaluate at compositor sources - - Returns - ------- - :class:`np.ndarray` - Array of downselected sources - """ - - # if source coordinates are defined, use intersect - if self.source_coordinates is not None: - # intersecting sources only - try: - _, I = self.source_coordinates.intersect(coordinates, outer=True, return_indices=True) - - except: # Likely non-monotonic coordinates - _, I = self.source_coordinates.intersect(coordinates, outer=False, return_indices=True) - i = I[0] - src_subset = self.sources[i] - - # no downselection possible - get all sources compositor - else: - src_subset = self.sources - - return src_subset - - def composite(self, coordinates, outputs, result=None): - """Implements the rules for compositing multiple sources together. - - Parameters - ---------- - outputs : list - A list of outputs that need to be composited together - result : UnitDataArray, optional - An optional pre-filled array may be supplied, otherwise the output will be allocated. - - Raises - ------ - NotImplementedError - """ - raise NotImplementedError() - - def iteroutputs(self, coordinates): - """Summary - - Parameters - ---------- - coordinates : :class:`podpac.Coordinates` - Coordinates to evaluate at compositor sources - - Yields - ------ - :class:`podpac.core.units.UnitsDataArray` - Output from source node eval method - """ - # downselect sources based on coordinates - src_subset = self.select_sources(coordinates) - - if len(src_subset) == 0: - yield self.create_output_array(coordinates) - return - - # Set the interpolation properties for sources - if self.interpolation is not None: - for s in src_subset.ravel(): - if trait_is_defined(self, "interpolation"): - s.set_trait("interpolation", self.interpolation) - - # Optimization: if coordinates complete and source coords is 1D, - # set native_coordinates unless they are set already - # WARNING: this assumes - # native_coords = source_coords + shared_coordinates - # NOT native_coords = shared_coords + source_coords - if self.is_source_coordinates_complete and self.source_coordinates.ndim == 1: - coords_subset = list(self.source_coordinates.intersect(coordinates, outer=True).coords.values())[0] - coords_dim = list(self.source_coordinates.dims)[0] - for s, c in zip(src_subset, coords_subset): - nc = merge_dims([Coordinates(np.atleast_1d(c), dims=[coords_dim]), self.shared_coordinates]) - - if trait_is_defined(s, "native_coordinates") is False: - s.set_trait("native_coordinates", nc) - - if settings["MULTITHREADING"]: - n_threads = thread_manager.request_n_threads(len(src_subset)) - if n_threads == 1: - thread_manager.release_n_threads(n_threads) - else: - n_threads = 0 - - if settings["MULTITHREADING"] and n_threads > 1: - # evaluate nodes in parallel using thread pool - self._multi_threaded = True - pool = 
thread_manager.get_thread_pool(processes=n_threads) - outputs = pool.map(lambda src: src.eval(coordinates), src_subset) - pool.close() - thread_manager.release_n_threads(n_threads) - for output in outputs: - yield output - - else: - # evaluate nodes serially - self._multi_threaded = False - for src in src_subset: - yield src.eval(coordinates) - - @node_eval - @common_doc(COMMON_COMPOSITOR_DOC) - def eval(self, coordinates, output=None): - """Evaluates this nodes using the supplied coordinates. - - Parameters - ---------- - coordinates : :class:`podpac.Coordinates` - {requested_coordinates} - output : podpac.UnitsDataArray, optional - {eval_output} - - Returns - ------- - {eval_return} - """ - - self._requested_coordinates = coordinates - - outputs = self.iteroutputs(coordinates) - output = self.composite(coordinates, outputs, output) - return output - - def find_coordinates(self): - """ - Get the available native coordinates for the Node. - - Returns - ------- - coords_list : list - list of available coordinates (Coordinate objects) - """ - - raise NotImplementedError("TODO") - - @property - @common_doc(COMMON_COMPOSITOR_DOC) - def base_definition(self): - """Base node defintion for Compositor nodes. - - Returns - ------- - {definition_return} - """ - d = super(Compositor, self).base_definition - d["sources"] = self.sources - d["interpolation"] = self.interpolation - return d - - -class OrderedCompositor(Compositor): - """Compositor that combines sources based on their order in self.sources. Once a request contains no - nans, the result is returned. - """ - - @common_doc(COMMON_COMPOSITOR_DOC) - def composite(self, coordinates, data_arrays, result=None): - """Composites data_arrays in order that they appear. - - Parameters - ---------- - coordinates : :class:`podpac.Coordinates` - {requested_coordinates} - data_arrays : generator - Generator that gives UnitDataArray's with the source values. - result : podpac.UnitsDataArray, optional - {eval_output} - - Returns - ------- - {eval_return} This composites the sources together until there are no nans or no more sources. 
- """ - - if result is None: - result = self.create_output_array(coordinates) - else: - result[:] = np.nan - - mask = UnitsDataArray.create(coordinates, outputs=self.outputs, data=0, dtype=bool) - for data in data_arrays: - if self.outputs is None: - data = data.transpose(*result.dims) - self._composite(result, data, mask) - else: - for name in data["output"]: - self._composite(result.sel(output=name), data.sel(output=name), mask.sel(output=name)) - - # stop if the results are full - if np.all(mask): - break - - return result - - @staticmethod - def _composite(result, data, mask): - source_mask = np.isfinite(data.data) - b = ~mask & source_mask - result.data[b.data] = data.data[b.data] - mask |= source_mask diff --git a/podpac/core/compositor/__init__.py b/podpac/core/compositor/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/podpac/core/compositor/compositor.py b/podpac/core/compositor/compositor.py new file mode 100644 index 000000000..8c25eb10a --- /dev/null +++ b/podpac/core/compositor/compositor.py @@ -0,0 +1,262 @@ +""" +Compositor Summary +""" + + +from __future__ import division, unicode_literals, print_function, absolute_import + +import copy + +import numpy as np +import traitlets as tl + +# Internal imports +from podpac.core.settings import settings +from podpac.core.coordinates import Coordinates +from podpac.core.utils import common_doc, NodeTrait +from podpac.core.node import COMMON_NODE_DOC, node_eval, Node +from podpac.core.data.datasource import COMMON_DATA_DOC +from podpac.core.interpolation.interpolation import InterpolationTrait +from podpac.core.managers.multi_threading import thread_manager + +COMMON_COMPOSITOR_DOC = COMMON_DATA_DOC.copy() # superset of COMMON_NODE_DOC + + +@common_doc(COMMON_COMPOSITOR_DOC) +class BaseCompositor(Node): + """A base class for compositor nodes. + + Attributes + ---------- + sources : list + Source nodes. + source_coordinates : :class:`podpac.Coordinates` + Coordinates that make each source unique. Must the same size as ``sources`` and single-dimensional. Optional. + interpolation : str, dict, optional + {interpolation} + + Notes + ----- + Developers of compositor subclasses nodes need to implement the `composite` method. + + Multitheading:: + * When MULTITHREADING is False, the compositor stops evaluated sources once the output is completely filled. + * When MULTITHREADING is True, the compositor must evaluate every source. + The result is the same, but note that because of this, disabling multithreading could sometimes be faster, + especially if the number of threads is low. + * NASA data servers seem to have a hard limit of 10 simultaneous requests, so a max of 10 threads is recommend + for most use-cases. + """ + + sources = tl.List(trait=NodeTrait()).tag(attr=True) + interpolation = InterpolationTrait(allow_none=True, default_value=None).tag(attr=True) + source_coordinates = tl.Instance(Coordinates, allow_none=True, default_value=None).tag(attr=True) + + auto_outputs = tl.Bool(False) + + # debug traits + _eval_sources = tl.Any() + + @tl.validate("sources") + def _validate_sources(self, d): + sources = d["value"] + + n = np.sum([source.outputs is None for source in sources]) + if not (n == 0 or n == len(sources)): + raise ValueError( + "Cannot composite standard sources with multi-output sources. " + "The sources must all be standard single-output nodes or all multi-output nodes." 
+ ) + + # copy so that interpolation trait of the input source is not overwritten + return [copy.deepcopy(source) for source in sources] + + @tl.validate("source_coordinates") + def _validate_source_coordinates(self, d): + if d["value"] is None: + return None + + if d["value"].ndim != 1: + raise ValueError("Invalid source_coordinates, invalid ndim (%d != 1)" % d["value"].ndim) + + if d["value"].size != len(self.sources): + raise ValueError( + "Invalid source_coordinates, source and source_coordinates size mismatch (%d != %d)" + % (d["value"].size, len(self.sources)) + ) + + return d["value"] + + @tl.default("outputs") + def _default_outputs(self): + if not self.auto_outputs: + return None + + # autodetect outputs from sources + if all(source.outputs is None for source in self.sources): + outputs = None + + elif all(source.outputs is not None and source.output is None for source in self.sources): + outputs = [] + for source in self.sources: + for output in source.outputs: + if output not in outputs: + outputs.append(output) + + if len(outputs) == 0: + outputs = None + + else: + raise RuntimeError( + "Compositor sources were not validated correctly. " + "Cannot composite standard sources with multi-output sources." + ) + + return outputs + + def select_sources(self, coordinates): + """Select and prepare sources based on requested coordinates. + + Parameters + ---------- + coordinates : :class:`podpac.Coordinates` + Coordinates to evaluate at compositor sources + + Returns + ------- + sources : :class:`np.ndarray` + Array of sources + + Notes + ----- + * If :attr:`source_coordinates` is defined, only sources that intersect the requested coordinates are selected. + * Sets sources :attr:`interpolation`. + """ + + # select intersecting sources, if possible + if self.source_coordinates is None: + sources = self.sources + else: + try: + _, I = self.source_coordinates.intersect(coordinates, outer=True, return_indices=True) + except: + # Likely non-monotonic coordinates + _, I = self.source_coordinates.intersect(coordinates, outer=False, return_indices=True) + i = I[0] + sources = np.array(self.sources)[i].tolist() + + # set the interpolation properties for sources + if self.trait_is_defined("interpolation") and self.interpolation is not None: + for s in sources: + if s.has_trait("interpolation"): + s.set_trait("interpolation", self.interpolation) + + return sources + + def composite(self, coordinates, data_arrays, result=None): + """Implements the rules for compositing multiple sources together. Must be implemented by child classes. + + Parameters + ---------- + coordinates : :class:`podpac.Coordinates` + {requested_coordinates} + data_arrays : list + Evaluated data from the sources. + result : UnitDataArray, optional + An optional pre-filled array may be supplied, otherwise the output will be allocated. 
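
Subclasses of the new `BaseCompositor` only need to implement `composite`; `eval` handles source selection, threading, and iteration. A minimal sketch follows (the `FirstSourceCompositor` name is hypothetical; the pattern mirrors the `MockComposite` used in the new tests, which simply returns the first source's output):

```python
# Sketch only: a trivial BaseCompositor subclass, assuming the classes added in this patch.
import numpy as np
import podpac
from podpac.core.compositor.compositor import BaseCompositor
from podpac.core.data.array_source import Array


class FirstSourceCompositor(BaseCompositor):
    def composite(self, coordinates, data_arrays, result=None):
        # data_arrays is the generator produced by iteroutputs()
        return next(data_arrays)


coords = podpac.Coordinates([podpac.clinspace(0, 1, 4), podpac.clinspace(0, 1, 4)], dims=["lat", "lon"])
a = Array(source=np.ones(coords.shape), coordinates=coords)
b = Array(source=np.zeros(coords.shape), coordinates=coords)

node = FirstSourceCompositor(sources=[a, b])
output = node.eval(coords)  # returns a's data, since composite stops at the first source
```
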
+ + Returns + ------- + {eval_return} + """ + + raise NotImplementedError() + + def iteroutputs(self, coordinates): + """Summary + + Parameters + ---------- + coordinates : :class:`podpac.Coordinates` + Coordinates to evaluate at compositor sources + + Yields + ------ + :class:`podpac.core.units.UnitsDataArray` + Output from source node eval method + """ + + # get sources, potentially downselected + sources = self.select_sources(coordinates) + + if settings["DEBUG"]: + self._eval_sources = sources + + if len(sources) == 0: + yield self.create_output_array(coordinates) + return + + if settings["MULTITHREADING"]: + n_threads = thread_manager.request_n_threads(len(sources)) + if n_threads == 1: + thread_manager.release_n_threads(n_threads) + else: + n_threads = 0 + + if settings["MULTITHREADING"] and n_threads > 1: + # evaluate nodes in parallel using thread pool + self._multi_threaded = True + pool = thread_manager.get_thread_pool(processes=n_threads) + outputs = pool.map(lambda src: src.eval(coordinates), sources) + pool.close() + thread_manager.release_n_threads(n_threads) + for output in outputs: + yield output + + else: + # evaluate nodes serially + self._multi_threaded = False + for src in sources: + yield src.eval(coordinates) + + @node_eval + @common_doc(COMMON_COMPOSITOR_DOC) + def eval(self, coordinates, output=None): + """Evaluates this nodes using the supplied coordinates. + + Parameters + ---------- + coordinates : :class:`podpac.Coordinates` + {requested_coordinates} + output : podpac.UnitsDataArray, optional + {eval_output} + + Returns + ------- + {eval_return} + """ + + self._requested_coordinates = coordinates + outputs = self.iteroutputs(coordinates) + output = self.composite(coordinates, outputs, output) + return output + + def find_coordinates(self): + """ + Get the available coordinates for the Node. + + Returns + ------- + coords_list : list + available coordinates from all of the sources. + """ + + return [coords for source in self.sources for coords in source.find_coordinates()] + + @property + def _repr_keys(self): + """list of attribute names, used by __repr__ and __str__ to display minimal info about the node""" + keys = [] + if self.trait_is_defined("sources"): + keys.append("sources") + return keys diff --git a/podpac/core/compositor/ordered_compositor.py b/podpac/core/compositor/ordered_compositor.py new file mode 100644 index 000000000..83449a764 --- /dev/null +++ b/podpac/core/compositor/ordered_compositor.py @@ -0,0 +1,70 @@ +from __future__ import division, unicode_literals, print_function, absolute_import + +import numpy as np + +# Internal imports +from podpac.core.units import UnitsDataArray +from podpac.core.utils import common_doc +from podpac.core.compositor.compositor import COMMON_COMPOSITOR_DOC, BaseCompositor + + +@common_doc(COMMON_COMPOSITOR_DOC) +class OrderedCompositor(BaseCompositor): + """Compositor that combines sources based on their order in self.sources. + + The requested data is interpolated by the sources before being composited. + + Attributes + ---------- + sources : list + Source nodes, in order of preference. Later sources are only used where earlier sources do not provide data. + source_coordinates : :class:`podpac.Coordinates` + Coordinates that make each source unique. Must the same size as ``sources`` and single-dimensional. Optional. 
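
The ordered-compositing rule described here is easiest to see in plain numpy. The snippet below is only an illustration of the masking logic in `_composite` and the short-circuit in `composite`, not part of the patch:

```python
# Illustration of the rule: each source only fills cells that are still
# unmasked in the result and finite in that source; stop once everything is filled.
import numpy as np

result = np.full(5, np.nan)
mask = np.zeros(5, dtype=bool)

sources = [
    np.array([1.0, np.nan, 1.0, np.nan, np.nan]),  # first (preferred) source
    np.array([2.0, 2.0, np.nan, 2.0, np.nan]),     # second source fills the gaps
]

for data in sources:
    source_mask = np.isfinite(data)
    b = ~mask & source_mask
    result[b] = data[b]
    mask |= source_mask
    if np.all(mask):  # short-circuit once every cell is filled
        break

print(result)  # [1. 2. 1. 2. nan]
```
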
+ interpolation : str, dict, optional + {interpolation} + """ + + @common_doc(COMMON_COMPOSITOR_DOC) + def composite(self, coordinates, data_arrays, result=None): + """Composites data_arrays in order that they appear. Once a request contains no nans, the result is returned. + + Parameters + ---------- + coordinates : :class:`podpac.Coordinates` + {requested_coordinates} + data_arrays : generator + Evaluated source data, in the same order as the sources. + result : podpac.UnitsDataArray, optional + {eval_output} + + Returns + ------- + {eval_return} This composites the sources together until there are no nans or no more sources. + """ + + if result is None: + result = self.create_output_array(coordinates) + else: + result[:] = np.nan + + mask = UnitsDataArray.create(coordinates, outputs=self.outputs, data=0, dtype=bool) + for data in data_arrays: + if self.outputs is None: + data = data.transpose(*result.dims) + self._composite(result, data, mask) + else: + for name in data["output"]: + self._composite(result.sel(output=name), data.sel(output=name), mask.sel(output=name)) + + # stop if the results are full + if np.all(mask): + break + + return result + + @staticmethod + def _composite(result, data, mask): + source_mask = np.isfinite(data.data) + b = ~mask & source_mask + result.data[b.data] = data.data[b.data] + mask |= source_mask diff --git a/podpac/core/compositor/test/test_base_compositor.py b/podpac/core/compositor/test/test_base_compositor.py new file mode 100644 index 000000000..385999450 --- /dev/null +++ b/podpac/core/compositor/test/test_base_compositor.py @@ -0,0 +1,243 @@ +import pytest +import numpy as np + +import podpac +from podpac.core.data.datasource import DataSource +from podpac.core.data.array_source import Array +from podpac.core.compositor.compositor import BaseCompositor + +COORDS = podpac.Coordinates( + [podpac.clinspace(45, 0, 16), podpac.clinspace(-70, -65, 16), podpac.clinspace(0, 1, 2)], + dims=["lat", "lon", "time"], +) +LON, LAT, TIME = np.meshgrid(COORDS["lon"].coordinates, COORDS["lat"].coordinates, COORDS["time"].coordinates) + +ARRAY_LAT = Array(source=LAT.astype(float), coordinates=COORDS, interpolation="bilinear") +ARRAY_LON = Array(source=LON.astype(float), coordinates=COORDS, interpolation="bilinear") +ARRAY_TIME = Array(source=TIME.astype(float), coordinates=COORDS, interpolation="bilinear") + +MULTI_0_XY = Array(source=np.full(COORDS.shape + (2,), 0), coordinates=COORDS, outputs=["x", "y"]) +MULTI_1_XY = Array(source=np.full(COORDS.shape + (2,), 1), coordinates=COORDS, outputs=["x", "y"]) +MULTI_4_YX = Array(source=np.full(COORDS.shape + (2,), 4), coordinates=COORDS, outputs=["y", "x"]) +MULTI_2_X = Array(source=np.full(COORDS.shape + (1,), 2), coordinates=COORDS, outputs=["x"]) +MULTI_3_Z = Array(source=np.full(COORDS.shape + (1,), 3), coordinates=COORDS, outputs=["z"]) + + +class TestBaseCompositor(object): + def test_init(self): + node = BaseCompositor(sources=[ARRAY_LAT, ARRAY_LON, ARRAY_TIME]) + repr(node) + + def test_source_coordinates(self): + # none (default) + node = BaseCompositor(sources=[ARRAY_LAT, ARRAY_LON, ARRAY_TIME]) + assert node.source_coordinates is None + + # unstacked + node = BaseCompositor( + sources=[podpac.algorithm.Arange(), podpac.algorithm.SinCoords()], + source_coordinates=podpac.Coordinates([[0, 1]], dims=["time"]), + ) + + # stacked + node = BaseCompositor( + sources=[podpac.algorithm.Arange(), podpac.algorithm.SinCoords()], + source_coordinates=podpac.Coordinates([[[0, 1], [10, 20]]], dims=["time_alt"]), + ) + + # 
invalid size + with pytest.raises(ValueError, match="Invalid source_coordinates, source and source_coordinates size mismatch"): + node = BaseCompositor( + sources=[podpac.algorithm.Arange(), podpac.algorithm.SinCoords()], + source_coordinates=podpac.Coordinates([[0, 1, 2]], dims=["time"]), + ) + + with pytest.raises(ValueError, match="Invalid source_coordinates, source and source_coordinates size mismatch"): + node = BaseCompositor( + sources=[podpac.algorithm.Arange(), podpac.algorithm.SinCoords()], + source_coordinates=podpac.Coordinates([[0, 1, 2]], dims=["time"]), + ) + + # invalid ndims + with pytest.raises(ValueError, match="Invalid source_coordinates"): + node = BaseCompositor( + sources=[podpac.algorithm.Arange(), podpac.algorithm.SinCoords()], + source_coordinates=podpac.Coordinates([[0, 1], [10, 20]], dims=["time", "alt"]), + ) + + def test_select_sources_default(self): + node = BaseCompositor( + sources=[DataSource(), DataSource(interpolation="nearest_preview"), podpac.algorithm.Arange()], + interpolation="bilinear", + ) + sources = node.select_sources(podpac.Coordinates([[0, 10]], ["time"])) + + assert isinstance(sources, list) + assert len(sources) == 3 + + def test_select_sources_intersection(self): + source_coords = podpac.Coordinates([[0, 10]], ["time"]) + node = BaseCompositor(sources=[DataSource(), DataSource()], source_coordinates=source_coords) + + # select all + selected = node.select_sources(source_coords) + assert len(selected) == 2 + assert selected[0] == node.sources[0] + assert selected[1] == node.sources[1] + + # select first + c = podpac.Coordinates([podpac.clinspace(0, 1, 10), podpac.clinspace(0, 1, 11), 0], ["lat", "lon", "time"]) + selected = node.select_sources(c) + assert len(selected) == 1 + assert selected[0] == node.sources[0] + + # select second + c = podpac.Coordinates([podpac.clinspace(0, 1, 10), podpac.clinspace(0, 1, 11), 10], ["lat", "lon", "time"]) + selected = node.select_sources(c) + assert len(selected) == 1 + assert selected[0] == node.sources[1] + + # select none + c = podpac.Coordinates([podpac.clinspace(0, 1, 10), podpac.clinspace(0, 1, 11), 100], ["lat", "lon", "time"]) + selected = node.select_sources(c) + assert len(selected) == 0 + + def test_select_sources_set_interpolation(self): + node = BaseCompositor(sources=[ARRAY_LAT, ARRAY_LON, ARRAY_TIME], interpolation="nearest") + sources = node.select_sources(COORDS) + assert sources[0].interpolation == "nearest" + assert sources[1].interpolation == "nearest" + assert sources[2].interpolation == "nearest" + assert ARRAY_LAT.interpolation == "bilinear" + assert ARRAY_LON.interpolation == "bilinear" + assert ARRAY_TIME.interpolation == "bilinear" + + # if no interpolation is provided, keep the source interpolation values + node = BaseCompositor(sources=[ARRAY_LAT, ARRAY_LON, ARRAY_TIME]) + sources = node.select_sources(COORDS) + assert node.sources[0].interpolation == "bilinear" + assert node.sources[1].interpolation == "bilinear" + assert node.sources[2].interpolation == "bilinear" + + def test_iteroutputs_empty(self): + node = BaseCompositor(sources=[ARRAY_LAT, ARRAY_LON, ARRAY_TIME]) + outputs = node.iteroutputs(podpac.Coordinates([-1, -1, -1], dims=["lat", "lon", "time"])) + np.testing.assert_array_equal(next(outputs), [[[np.nan]]]) + np.testing.assert_array_equal(next(outputs), [[[np.nan]]]) + np.testing.assert_array_equal(next(outputs), [[[np.nan]]]) + with pytest.raises(StopIteration): + next(outputs) + + def test_iteroutputs_singlethreaded(self): + with podpac.settings: + 
podpac.settings["MULTITHREADING"] = False + + node = BaseCompositor(sources=[ARRAY_LAT, ARRAY_LON, ARRAY_TIME]) + outputs = node.iteroutputs(COORDS) + np.testing.assert_array_equal(next(outputs), LAT) + np.testing.assert_array_equal(next(outputs), LON) + np.testing.assert_array_equal(next(outputs), TIME) + with pytest.raises(StopIteration): + next(outputs) + assert node._multi_threaded == False + + def test_iteroutputs_multithreaded(self): + with podpac.settings: + podpac.settings["MULTITHREADING"] = True + podpac.settings["N_THREADS"] = 8 + + n_threads_before = podpac.core.managers.multi_threading.thread_manager._n_threads_used + node = BaseCompositor(sources=[ARRAY_LAT, ARRAY_LON, ARRAY_TIME]) + outputs = node.iteroutputs(COORDS) + np.testing.assert_array_equal(next(outputs), LAT) + np.testing.assert_array_equal(next(outputs), LON) + np.testing.assert_array_equal(next(outputs), TIME) + with pytest.raises(StopIteration): + next(outputs) + assert node._multi_threaded == True + assert podpac.core.managers.multi_threading.thread_manager._n_threads_used == n_threads_before + + def test_iteroutputs_n_threads_1(self): + with podpac.settings: + podpac.settings["MULTITHREADING"] = True + podpac.settings["N_THREADS"] = 1 + + n_threads_before = podpac.core.managers.multi_threading.thread_manager._n_threads_used + node = BaseCompositor(sources=[ARRAY_LAT, ARRAY_LON, ARRAY_TIME]) + outputs = node.iteroutputs(COORDS) + np.testing.assert_array_equal(next(outputs), LAT) + np.testing.assert_array_equal(next(outputs), LON) + np.testing.assert_array_equal(next(outputs), TIME) + with pytest.raises(StopIteration): + next(outputs) + assert node._multi_threaded == False + assert podpac.core.managers.multi_threading.thread_manager._n_threads_used == n_threads_before + + def test_composite(self): + node = BaseCompositor(sources=[ARRAY_LAT, ARRAY_LON, ARRAY_TIME]) + with pytest.raises(NotImplementedError): + node.composite(COORDS, iter(())) + + def test_eval(self): + node = BaseCompositor(sources=[ARRAY_LAT, ARRAY_LON, ARRAY_TIME]) + with pytest.raises(NotImplementedError): + node.eval(COORDS) + + class MockComposite(BaseCompositor): + def composite(self, coordinates, outputs, result=None): + return next(outputs) + + node = MockComposite(sources=[ARRAY_LAT, ARRAY_LON, ARRAY_TIME]) + output = node.eval(COORDS) + np.testing.assert_array_equal(output, LAT) + + def test_find_coordinates(self): + node = BaseCompositor(sources=[ARRAY_LAT, ARRAY_LON, ARRAY_TIME]) + + coord_list = node.find_coordinates() + assert isinstance(coord_list, list) + assert len(coord_list) == 3 + + def test_outputs(self): + # standard single-output + node = BaseCompositor(sources=[ARRAY_LAT, ARRAY_LON, ARRAY_TIME]) + assert node.outputs is None + + # even if the sources have multiple outputs, the default here is outputs + node = BaseCompositor(sources=[MULTI_0_XY, MULTI_1_XY]) + assert node.outputs is None + + def test_auto_outputs(self): + # autodetect single-output + node = BaseCompositor(sources=[ARRAY_LAT, ARRAY_LON, ARRAY_TIME], auto_outputs=True) + assert node.outputs is None + + # autodetect multi-output + node = BaseCompositor(sources=[MULTI_0_XY, MULTI_1_XY], auto_outputs=True) + assert node.outputs == ["x", "y"] + + node = BaseCompositor(sources=[MULTI_0_XY, MULTI_3_Z], auto_outputs=True) + assert node.outputs == ["x", "y", "z"] + + node = BaseCompositor(sources=[MULTI_3_Z, MULTI_0_XY], auto_outputs=True) + assert node.outputs == ["z", "x", "y"] + + node = BaseCompositor(sources=[MULTI_0_XY, MULTI_4_YX], auto_outputs=True) + assert 
node.outputs == ["x", "y"] + + # mixed + with pytest.raises(ValueError, match="Cannot composite standard sources with multi-output sources."): + node = BaseCompositor(sources=[MULTI_2_X, ARRAY_LAT], auto_outputs=True) + + # no sources + node = BaseCompositor(sources=[], auto_outputs=True) + assert node.outputs is None + + def test_forced_invalid_sources(self): + class MyCompositor(BaseCompositor): + sources = [MULTI_2_X, ARRAY_LAT] + auto_outputs = True + + node = MyCompositor() + with pytest.raises(RuntimeError, match="Compositor sources were not validated correctly"): + node.outputs diff --git a/podpac/core/compositor/test/test_ordered_compositor.py b/podpac/core/compositor/test/test_ordered_compositor.py new file mode 100644 index 000000000..002e3f931 --- /dev/null +++ b/podpac/core/compositor/test/test_ordered_compositor.py @@ -0,0 +1,168 @@ +import numpy as np + +import podpac +from podpac.core.data.array_source import Array +from podpac.core.compositor.ordered_compositor import OrderedCompositor + +COORDS = podpac.Coordinates( + [podpac.clinspace(45, 0, 16), podpac.clinspace(-70, -65, 16), podpac.clinspace(0, 1, 2)], + dims=["lat", "lon", "time"], +) + +MULTI_0_XY = Array(source=np.full(COORDS.shape + (2,), 0), coordinates=COORDS, outputs=["x", "y"]) +MULTI_1_XY = Array(source=np.full(COORDS.shape + (2,), 1), coordinates=COORDS, outputs=["x", "y"]) +MULTI_4_YX = Array(source=np.full(COORDS.shape + (2,), 4), coordinates=COORDS, outputs=["y", "x"]) +MULTI_2_X = Array(source=np.full(COORDS.shape + (1,), 2), coordinates=COORDS, outputs=["x"]) +MULTI_3_Z = Array(source=np.full(COORDS.shape + (1,), 3), coordinates=COORDS, outputs=["z"]) + + +class TestOrderedCompositor(object): + def test_composite(self): + with podpac.settings: + podpac.settings["MULTITHREADING"] = False + + acoords = podpac.Coordinates([[0, 1], [10, 20, 30]], dims=["lat", "lon"]) + asource = np.ones(acoords.shape) + asource[0, :] = np.nan + a = Array(source=asource, coordinates=acoords) + + bcoords = podpac.Coordinates([[0, 1, 2], [10, 20, 30, 40]], dims=["lat", "lon"]) + bsource = np.zeros(bcoords.shape) + bsource[:, 0] = np.nan + b = Array(source=bsource, coordinates=bcoords) + + coords = podpac.Coordinates([[0, 1, 2], [10, 20, 30, 40, 50]], dims=["lat", "lon"]) + + node = OrderedCompositor(sources=[a, b], interpolation="bilinear") + expected = np.array( + [[np.nan, 0.0, 0.0, 0.0, np.nan], [1.0, 1.0, 1.0, 0.0, np.nan], [np.nan, 0.0, 0.0, 0.0, np.nan]] + ) + np.testing.assert_allclose(node.eval(coords), expected, equal_nan=True) + + node = OrderedCompositor(sources=[b, a], interpolation="bilinear") + expected = np.array( + [[np.nan, 0.0, 0.0, 0.0, np.nan], [1.0, 0.0, 0.0, 0.0, np.nan], [np.nan, 0.0, 0.0, 0.0, np.nan]] + ) + np.testing.assert_allclose(node.eval(coords), expected, equal_nan=True) + + def test_composite_multithreaded(self): + with podpac.settings: + podpac.settings["MULTITHREADING"] = True + podpac.settings["N_THREADS"] = 8 + + acoords = podpac.Coordinates([[0, 1], [10, 20, 30]], dims=["lat", "lon"]) + asource = np.ones(acoords.shape) + asource[0, :] = np.nan + a = Array(source=asource, coordinates=acoords) + + bcoords = podpac.Coordinates([[0, 1, 2], [10, 20, 30, 40]], dims=["lat", "lon"]) + bsource = np.zeros(bcoords.shape) + bsource[:, 0] = np.nan + b = Array(source=bsource, coordinates=bcoords) + + coords = podpac.Coordinates([[0, 1, 2], [10, 20, 30, 40, 50]], dims=["lat", "lon"]) + + node = OrderedCompositor(sources=[a, b], interpolation="bilinear") + expected = np.array( + [[np.nan, 0.0, 0.0, 0.0, 
np.nan], [1.0, 1.0, 1.0, 0.0, np.nan], [np.nan, 0.0, 0.0, 0.0, np.nan]] + ) + np.testing.assert_allclose(node.eval(coords), expected, equal_nan=True) + + node = OrderedCompositor(sources=[b, a], interpolation="bilinear") + expected = np.array( + [[np.nan, 0.0, 0.0, 0.0, np.nan], [1.0, 0.0, 0.0, 0.0, np.nan], [np.nan, 0.0, 0.0, 0.0, np.nan]] + ) + np.testing.assert_allclose(node.eval(coords), expected, equal_nan=True) + + def test_composite_short_circuit(self): + with podpac.settings: + podpac.settings["MULTITHREADING"] = False + podpac.settings["DEBUG"] = True + + coords = podpac.Coordinates([[0, 1], [10, 20, 30]], dims=["lat", "lon"]) + a = Array(source=np.ones(coords.shape), coordinates=coords) + b = Array(source=np.zeros(coords.shape), coordinates=coords) + node = OrderedCompositor(sources=[a, b], interpolation="bilinear") + output = node.eval(coords) + np.testing.assert_array_equal(output, a.source) + assert node._eval_sources[0]._output is not None + assert node._eval_sources[1]._output is None + + def test_composite_short_circuit_multithreaded(self): + with podpac.settings: + podpac.settings["MULTITHREADING"] = True + podpac.settings["N_THREADS"] = 8 + podpac.settings["DEBUG"] = True + + coords = podpac.Coordinates([[0, 1], [10, 20, 30]], dims=["lat", "lon"]) + n_threads_before = podpac.core.managers.multi_threading.thread_manager._n_threads_used + a = Array(source=np.ones(coords.shape), coordinates=coords) + b = Array(source=np.zeros(coords.shape), coordinates=coords) + node = OrderedCompositor(sources=[a, b], interpolation="bilinear") + output = node.eval(coords) + np.testing.assert_array_equal(output, a.source) + assert node._multi_threaded == True + assert podpac.core.managers.multi_threading.thread_manager._n_threads_used == n_threads_before + + def test_composite_into_result(self): + coords = podpac.Coordinates([[0, 1], [10, 20, 30]], dims=["lat", "lon"]) + a = Array(source=np.ones(coords.shape), coordinates=coords) + b = Array(source=np.zeros(coords.shape), coordinates=coords) + node = OrderedCompositor(sources=[a, b], interpolation="bilinear") + result = node.create_output_array(coords, data=np.random.random(coords.shape)) + output = node.eval(coords, output=result) + np.testing.assert_array_equal(output, a.source) + np.testing.assert_array_equal(result, a.source) + + def test_composite_multiple_outputs(self): + node = OrderedCompositor(sources=[MULTI_0_XY, MULTI_1_XY], auto_outputs=True) + output = node.eval(COORDS) + assert output.dims == ("lat", "lon", "time", "output") + np.testing.assert_array_equal(output["output"], ["x", "y"]) + np.testing.assert_array_equal(output.sel(output="x"), np.full(COORDS.shape, 0)) + np.testing.assert_array_equal(output.sel(output="y"), np.full(COORDS.shape, 0)) + + node = OrderedCompositor(sources=[MULTI_1_XY, MULTI_0_XY], auto_outputs=True) + output = node.eval(COORDS) + assert output.dims == ("lat", "lon", "time", "output") + np.testing.assert_array_equal(output["output"], ["x", "y"]) + np.testing.assert_array_equal(output.sel(output="x"), np.full(COORDS.shape, 1)) + np.testing.assert_array_equal(output.sel(output="y"), np.full(COORDS.shape, 1)) + + def test_composite_combine_multiple_outputs(self): + node = OrderedCompositor(sources=[MULTI_0_XY, MULTI_1_XY, MULTI_2_X, MULTI_3_Z], auto_outputs=True) + output = node.eval(COORDS) + assert output.dims == ("lat", "lon", "time", "output") + np.testing.assert_array_equal(output["output"], ["x", "y", "z"]) + np.testing.assert_array_equal(output.sel(output="x"), np.full(COORDS.shape, 0)) + 
np.testing.assert_array_equal(output.sel(output="y"), np.full(COORDS.shape, 0)) + np.testing.assert_array_equal(output.sel(output="z"), np.full(COORDS.shape, 3)) + + node = OrderedCompositor(sources=[MULTI_3_Z, MULTI_2_X, MULTI_0_XY, MULTI_1_XY], auto_outputs=True) + output = node.eval(COORDS) + assert output.dims == ("lat", "lon", "time", "output") + np.testing.assert_array_equal(output["output"], ["z", "x", "y"]) + np.testing.assert_array_equal(output.sel(output="x"), np.full(COORDS.shape, 2)) + np.testing.assert_array_equal(output.sel(output="y"), np.full(COORDS.shape, 0)) + np.testing.assert_array_equal(output.sel(output="z"), np.full(COORDS.shape, 3)) + + node = OrderedCompositor(sources=[MULTI_2_X, MULTI_4_YX], auto_outputs=True) + output = node.eval(COORDS) + assert output.dims == ("lat", "lon", "time", "output") + np.testing.assert_array_equal(output["output"], ["x", "y"]) + np.testing.assert_array_equal(output.sel(output="x"), np.full(COORDS.shape, 2)) + np.testing.assert_array_equal(output.sel(output="y"), np.full(COORDS.shape, 4)) + + def test_composite_stacked_unstacked(self): + anative = podpac.Coordinates([podpac.clinspace((0, 1), (1, 2), size=3)], dims=["lat_lon"]) + bnative = podpac.Coordinates([podpac.clinspace(-2, 3, 3), podpac.clinspace(-1, 4, 3)], dims=["lat", "lon"]) + a = Array(source=np.random.rand(3), coordinates=anative) + b = Array(source=np.random.rand(3, 3) + 2, coordinates=bnative) + + coords = podpac.Coordinates([podpac.clinspace(-3, 4, 32), podpac.clinspace(-2, 5, 32)], dims=["lat", "lon"]) + + node = OrderedCompositor(sources=[a, b], interpolation="nearest") + o = node.eval(coords) + # Check that both data sources are being used in the interpolation + assert np.any(o.data >= 2) + assert np.any(o.data <= 1) diff --git a/podpac/core/compositor/test/test_tile_compositor.py b/podpac/core/compositor/test/test_tile_compositor.py new file mode 100644 index 000000000..417b90d3f --- /dev/null +++ b/podpac/core/compositor/test/test_tile_compositor.py @@ -0,0 +1,160 @@ +import itertools + +import pytest +import numpy as np +import traitlets as tl + +import podpac +from podpac.utils import cached_property +from podpac.data import DataSource +from podpac.core.compositor.tile_compositor import TileCompositor, UniformTileCompositor, UniformTileMixin + + +class MockTile(UniformTileMixin, podpac.data.DataSource): + x = tl.Int() # used as a modifier to distinguish between tiles in the tests + data = np.arange(16).reshape(1, 4, 4) + + def get_data(self, coordinates, coordinates_index): + return self.create_output_array(coordinates, data=self.data[coordinates_index] + self.x) + + +class MockTileCompositor(UniformTileCompositor): + shape = (3, 3, 3) + + @cached_property + def sources(self): + return [ + MockTile(tile=(i, j, k), grid=self, x=20 * n) + for n, (i, j, k) in enumerate(itertools.product(range(3), range(3), range(3))) + ] + + def get_coordinates(self): + return podpac.Coordinates( + [["2018-01-01", "2018-01-03", "2018-01-05"], podpac.clinspace(0, 11, 12), podpac.clinspace(0, 11, 12)], + dims=["time", "lat", "lon"], + ) + + +class TestUniformTileMixin(object): + def test_tile_coordinates_index(self): + class MyTile(UniformTileMixin, DataSource): + pass + + grid = MockTileCompositor() + tile = MyTile(grid=grid, tile=(1, 1, 0)) + + assert tile.width == grid.tile_width + assert tile.coordinates == podpac.Coordinates( + ["2018-01-03", podpac.clinspace(4, 7, 4), podpac.clinspace(0, 3, 4)], dims=["time", "lat", "lon"] + ) + + def test_repr(self): + class 
MyTile(UniformTileMixin, DataSource): + pass + + grid = MockTileCompositor() + tile = MyTile(grid=grid, tile=(1, 1, 0)) + assert "MyTile" in repr(tile) + assert "tile=(1, 1, 0)" in repr(tile) + + +class TestTileCompositor(object): + def test_sources(self): + node = TileCompositor() + with pytest.raises(NotImplementedError): + node.sources + + node = MockTileCompositor() + assert len(node.sources) == 27 + assert all(isinstance(tile, MockTile) for tile in node.sources) + + def test_coordinates(self): + node = TileCompositor() + with pytest.raises(NotImplementedError): + node.coordinates + + node = MockTileCompositor() + assert node.coordinates == podpac.Coordinates( + [["2018-01-01", "2018-01-03", "2018-01-05"], podpac.clinspace(0, 11, 12), podpac.clinspace(0, 11, 12)], + dims=["time", "lat", "lon"], + ) + + +class TestUniformTileCompositor(object): + def test_tile_width(self): + node = MockTileCompositor() + assert node.tile_width == (1, 4, 4) + + def test_get_data_coordinates(self): + node = MockTileCompositor() + + # all coordinates + output = node.eval(node.coordinates) + assert np.all(np.isfinite(output)) + np.testing.assert_array_equal(output[0, :4, :4], np.arange(16).reshape(4, 4) + 0) + np.testing.assert_array_equal(output[0, :4, 4:8], np.arange(16).reshape(4, 4) + 20) + np.testing.assert_array_equal(output[0, 4:8, :4], np.arange(16).reshape(4, 4) + 60) + np.testing.assert_array_equal(output[1, :4, :4], np.arange(16).reshape(4, 4) + 180) + + # single point + output = node.eval(node.coordinates[2, 2, 2]) + np.testing.assert_array_equal(output, [[[370]]]) + + # partial tiles + output = node.eval(node.coordinates[1, 2:6, 2:4]) + np.testing.assert_array_equal(output, [[[190, 191], [194, 195], [242, 243], [246, 247]]]) + + def test_get_data_spatial_interpolation(self): + # exact times, interpolated lat/lon + c1 = podpac.Coordinates(["2018-01-01", [0.25, 0.75, 1.25], [0.25, 0.75, 1.25]], dims=["time", "lat", "lon"]) + c2 = podpac.Coordinates(["2018-01-03", [0.25, 0.75, 1.25], [0.25, 0.75, 1.25]], dims=["time", "lat", "lon"]) + + node = MockTileCompositor(interpolation="nearest") + np.testing.assert_array_equal(node.eval(c1), [[[0, 1, 1], [4, 5, 5], [4, 5, 5]]]) + np.testing.assert_array_equal(node.eval(c2), [[[180, 181, 181], [184, 185, 185], [184, 185, 185]]]) + + node = MockTileCompositor(interpolation="bilinear") + np.testing.assert_array_equal(node.eval(c1), [[[1.25, 1.75, 2.25], [3.25, 3.75, 4.25], [5.25, 5.75, 6.25]]]) + np.testing.assert_array_equal( + node.eval(c2), [[[181.25, 181.75, 182.25], [183.25, 183.75, 184.25], [185.25, 185.75, 186.25]]] + ) + + def test_get_data_time_interpolation(self): + # exact lat/lon, interpolated times + c1 = podpac.Coordinates(["2018-01-01T01:00:00", [1, 2], [1, 2]], dims=["time", "lat", "lon"]) + c2 = podpac.Coordinates(["2018-01-02T23:00:00", [1, 2], [1, 2]], dims=["time", "lat", "lon"]) + c3 = podpac.Coordinates(["2018-01-03T01:00:00", [1, 2], [1, 2]], dims=["time", "lat", "lon"]) + + node = MockTileCompositor(interpolation="nearest") + np.testing.assert_array_equal(node.eval(c1), [[[5, 6], [9, 10]]]) + np.testing.assert_array_equal(node.eval(c2), [[[185, 186], [189, 190]]]) + np.testing.assert_array_equal(node.eval(c3), [[[185, 186], [189, 190]]]) + + # TODO + # node = MockTileCompositor(interpolation='bilinear') + # np.testing.assert_array_equal(node.eval(c1), TODO) + # np.testing.assert_array_equal(node.eval(c2), TODO) + # np.testing.assert_array_equal(node.eval(c3), TODO) + + def test_get_data_interpolation(self): + # interpolated lat/lon and 
time + c1 = podpac.Coordinates( + ["2018-01-01T01:00:00", [0.25, 0.75, 1.25], [0.25, 0.75, 1.25]], dims=["time", "lat", "lon"] + ) + c2 = podpac.Coordinates( + ["2018-01-02T23:00:00", [0.25, 0.75, 1.25], [0.25, 0.75, 1.25]], dims=["time", "lat", "lon"] + ) + c3 = podpac.Coordinates( + ["2018-01-03T01:00:00", [0.25, 0.75, 1.25], [0.25, 0.75, 1.25]], dims=["time", "lat", "lon"] + ) + + node = MockTileCompositor(interpolation="nearest") + np.testing.assert_array_equal(node.eval(c1), [[[0, 1, 1], [4, 5, 5], [4, 5, 5]]]) + np.testing.assert_array_equal(node.eval(c2), [[[180, 181, 181], [184, 185, 185], [184, 185, 185]]]) + np.testing.assert_array_equal(node.eval(c3), [[[180, 181, 181], [184, 185, 185], [184, 185, 185]]]) + + # TODO + # node = MockTileCompositor(interpolation='bilinear') + # np.testing.assert_array_equal(node.eval(c1), TODO) + # np.testing.assert_array_equal(node.eval(c2), TODO) + # np.testing.assert_array_equal(node.eval(c3), TODO) diff --git a/podpac/core/compositor/tile_compositor.py b/podpac/core/compositor/tile_compositor.py new file mode 100644 index 000000000..0f509c2b5 --- /dev/null +++ b/podpac/core/compositor/tile_compositor.py @@ -0,0 +1,131 @@ +from __future__ import division, unicode_literals, print_function, absolute_import + +import numpy as np +import traitlets as tl + +import podpac +from podpac.core.coordinates import Coordinates +from podpac.core.units import UnitsDataArray +from podpac.core.utils import common_doc, cached_property, ind2slice +from podpac.core.data.datasource import DataSource, COMMON_DATA_DOC + + +@common_doc(COMMON_DATA_DOC) +class TileCompositor(DataSource): + """Composite tiled datasources. + + Attributes + ---------- + sources : list + The tiled data sources. + coordinates : Coordinates + Coordinates encompassing all of the tiled sources. + + Notes + ----- + This compositor aggregates source data first and then interpolates the requested coordinates. + """ + + @property + def sources(self): + """ Tiled data sources (using the TileMixin). + + Child classes should define these sources including a reference to itself and the tile_coordinates_index. + """ + + raise NotImplementedError() + + def get_data(self, coordinates, coordinates_index): + """{get_data} + """ + + output = self.create_output_array(coordinates) + for source in self.sources: + c, I = source.coordinates.intersect(coordinates, return_indices=True) + if c.size == 0: + continue + source_data = source.get_data(c, I) + output.loc[source_data.coords] = source_data + + return output + + +@common_doc(COMMON_DATA_DOC) +class UniformTileCompositor(TileCompositor): + """Composite a grid of uniformly tiled datasources. + + Attributes + ---------- + sources : list + The tiled data sources. + coordinates : Coordinates + Coordinates encompassing all of the tiled sources. + shape : tuple + shape of the tile grid + tile_width : tuple + shape of the coordinates for each tile + + Notes + ----- + This compositor aggregates source data first and then interpolates the requested coordinates. + """ + + shape = tl.Tuple() + _repr_keys = ["shape"] + + @property + def sources(self): + """ Tiled data sources (using the UniformTileMixin). + + Child classes should define these sources including a reference to itself and the tile index in the grid. 
+ """ + + raise NotImplementedError() + + @cached_property + def tile_width(self): + """Tuple of the number of coordinates that the tile covers in each dimension.""" + return tuple(int(n / m) for n, m in zip(self.coordinates.shape, self.shape)) + + +@common_doc(COMMON_DATA_DOC) +class UniformTileMixin(podpac.Node): + """DataSource mixin for uniform tiles in a grid. + + Defines the tile coordinates from the grid coordinates using the tile position in the grid. + + Attributes + ---------- + grid : TileCompositor + tiling compositor containing the grid coordinates, grid shape, and tile sources + tile : tuple + index for this tile in the grid + width : tuple + width + """ + + grid = tl.Instance(TileCompositor) + tile = tl.Tuple() + + @tl.validate("tile") + def _validate_tile(self, d): + tile = d["value"] + if len(tile) != len(self.grid.shape): + raise ValueError("tile index does not match grid shape (%d != %d)" % (len(tile), len(self.grid.shape))) + if not all(0 <= i < n for (i, n) in zip(tile, self.grid.shape)): + raise ValueError("tile index %s out of range for grid shape %s)" % (len(tile), len(self.grid.shape))) + return tile + + @property + def width(self): + return self.grid.tile_width + + def get_coordinates(self): + """{get_coordinates} + """ + Is = tuple(slice(w * i, w * (i + 1)) for i, w in zip(self.tile, self.width)) + return self.grid.coordinates[Is] + + @property + def _repr_keys(self): + return super(UniformTileMixin, self)._repr_keys + ["tile"] diff --git a/podpac/core/coordinates/array_coordinates1d.py b/podpac/core/coordinates/array_coordinates1d.py index 446841cca..ce90b6a96 100644 --- a/podpac/core/coordinates/array_coordinates1d.py +++ b/podpac/core/coordinates/array_coordinates1d.py @@ -43,7 +43,8 @@ class ArrayCoordinates1d(Coordinates1d): """ coordinates = ArrayTrait(ndim=1, read_only=True) - coordinates.__doc__ = ":array: User-defined coordinate values" + # coordinates.__doc__ = ":array: User-defined coordinate values" + # coordinates = None def __init__(self, coordinates, name=None, ctype=None, segment_lengths=None): """ @@ -63,7 +64,9 @@ def __init__(self, coordinates, name=None, ctype=None, segment_lengths=None): """ # validate and set coordinates - self.set_trait("coordinates", make_coord_array(coordinates)) + coordinates = make_coord_array(coordinates) + self.set_trait("coordinates", coordinates) + self.not_a_trait = coordinates # precalculate once if self.coordinates.size == 0: diff --git a/podpac/core/coordinates/coordinates.py b/podpac/core/coordinates/coordinates.py index b752eb549..5ef5881af 100644 --- a/podpac/core/coordinates/coordinates.py +++ b/podpac/core/coordinates/coordinates.py @@ -74,10 +74,11 @@ class Coordinates(tl.HasTraits): """ crs = tl.Unicode(read_only=True, allow_none=True) + crs_desc = tl.Unicode(allow_none=True) _coords = OrderedDictTrait(trait=tl.Instance(BaseCoordinates), default_value=OrderedDict()) - def __init__(self, coords, dims=None, crs=None, ctype=None): + def __init__(self, coords, dims=None, crs=None, crs_desc=None, ctype=None, validate_crs=True): """ Create multidimensional coordinates. @@ -96,9 +97,13 @@ def __init__(self, coords, dims=None, crs=None, ctype=None): * 'lat', 'lon', 'alt', or 'time' for unstacked coordinates * dimension names joined by an underscore for stacked coordinates crs : str, optional - Coordinate reference system. Supports any PROJ4 or PROJ6 compliant string (https://proj.org). + Coordinate reference system. Supports PROJ4 and WKT. + crs_desc : str, optional + Abbreviated display description for the crs. 
ctype : str, optional Default coordinates type. One of 'point', 'midpoint', 'left', 'right'. + validate_crs : bool, optional + Use False to skip crs validation. Default True. """ if not isinstance(coords, (list, tuple, np.ndarray, xr.DataArray)): @@ -148,9 +153,20 @@ def __init__(self, coords, dims=None, crs=None, ctype=None): dcoords[dim] = c self.set_trait("_coords", dcoords) + if crs is not None: - self.set_trait("crs", crs) - super(Coordinates, self).__init__() + # validate + if validate_crs: + # raises pyproj.CRSError if invalid + CRS = pyproj.CRS(crs) + + # make sure CRS defines vertical units + if "alt" in self.udims and not CRS.is_vertical: + raise ValueError("Altitude dimension is defined, but CRS does not contain vertical unit") + + crs = self.set_trait("crs", crs) + + super(Coordinates, self).__init__(crs_desc=crs_desc) @tl.validate("_coords") def _validate_coords(self, d): @@ -174,21 +190,6 @@ def _validate_coords(self, d): def _default_crs(self): return settings["DEFAULT_CRS"] - @tl.validate("crs") - def _validate_crs(self, d): - val = d["value"] - CRS = pyproj.CRS(val) # raises pyproj.CRSError if invalid - - # make sure CRS defines vertical units - if "alt" in self.udims and not CRS.is_vertical: - raise ValueError("Altitude dimension is defined, but CRS does not contain vertical unit") - - return val - - @tl.observe("crs") - def _observe_crs(self, d): - crs = d["new"] - # ------------------------------------------------------------------------------------------------------------------ # Alternate constructors # ------------------------------------------------------------------------------------------------------------------ @@ -601,7 +602,9 @@ def __getitem__(self, index): indices.append(index[i]) i += 1 - return Coordinates([c[I] for c, I in zip(self._coords.values(), indices)], **self.properties) + return Coordinates( + [c[I] for c, I in zip(self._coords.values(), indices)], validate_crs=False, **self.properties + ) def __setitem__(self, dim, c): @@ -943,7 +946,9 @@ def drop(self, dims, ignore_missing=False): if dim not in self.dims and not ignore_missing: raise KeyError("Dimension '%s' not found in Coordinates with dims %s" % (dim, self.dims)) - return Coordinates([c for c in self._coords.values() if c.name not in dims], **self.properties) + return Coordinates( + [c for c in self._coords.values() if c.name not in dims], validate_crs=False, **self.properties + ) # do we ever need this? 
def udrop(self, dims, ignore_missing=False): @@ -1010,7 +1015,7 @@ def udrop(self, dims, ignore_missing=False): elif len(stacked) == 1: cs.append(stacked[0]) - return Coordinates(cs, **self.properties) + return Coordinates(cs, validate_crs=False, **self.properties) def intersect(self, other, dims=None, outer=False, return_indices=False): """ @@ -1132,13 +1137,13 @@ def select(self, bounds, return_indices=False, outer=False): def _make_selected_coordinates(self, selections, return_indices): if return_indices: - coords = Coordinates([c for c, I in selections], **self.properties) + coords = Coordinates([c for c, I in selections], validate_crs=False, **self.properties) # unbundle DepedentCoordinates indices I = [I if isinstance(c, DependentCoordinates) else [I] for c, I in selections] I = [e for l in I for e in l] return coords, tuple(I) else: - return Coordinates(selections, **self.properties) + return Coordinates(selections, validate_crs=False, **self.properties) def unique(self, return_indices=False): """ @@ -1177,7 +1182,7 @@ def unstack(self): xr.DataArray.unstack """ - return Coordinates([self[dim] for dim in self.udims], **self.properties) + return Coordinates([self[dim] for dim in self.udims], validate_crs=False, **self.properties) def iterchunks(self, shape, return_slices=False): """ @@ -1200,7 +1205,9 @@ def iterchunks(self, shape, return_slices=False): l = [[slice(i, i + n) for i in range(0, m, n)] for m, n in zip(self.shape, shape)] for slices in itertools.product(*l): - coords = Coordinates([self._coords[dim][slc] for dim, slc in zip(self.dims, slices)], **self.properties) + coords = Coordinates( + [self._coords[dim][slc] for dim, slc in zip(self.dims, slices)], validate_crs=False, **self.properties + ) if return_slices: yield coords, slices else: @@ -1256,7 +1263,7 @@ def transpose(self, *dims, **kwargs): self._coords = OrderedDict(zip(dims, coords)) return self else: - return Coordinates(coords, **self.properties) + return Coordinates(coords, validate_crs=False, **self.properties) def transform(self, crs=None): """ @@ -1330,7 +1337,7 @@ def transform(self, crs=None): from_crs = self.CRS to_crs = pyproj.CRS(crs) - # make sure to CRS defines vertical units + # make sure the CRS defines vertical units if "alt" in self.udims and not to_crs.is_vertical: raise ValueError("Altitude dimension is defined, but CRS to transform does not contain vertical unit") @@ -1376,7 +1383,7 @@ def transform(self, crs=None): # transform transformer = pyproj.Transformer.from_proj(from_crs, to_crs, always_xy=True) ts = [c._transform(transformer) for c in cs] - return Coordinates(ts, crs=input_crs) + return Coordinates(ts, crs=input_crs, validate_crs=False) # ------------------------------------------------------------------------------------------------------------------ # Operators/Magic Methods @@ -1384,7 +1391,9 @@ def transform(self, crs=None): def __repr__(self): rep = str(self.__class__.__name__) - if self.crs: + if self.crs_desc: + rep += " ({})".format(self.crs_desc) + elif self.crs: rep += " ({})".format(self.crs) for c in self._coords.values(): if isinstance(c, Coordinates1d): @@ -1424,7 +1433,7 @@ def merge_dims(coords_list): raise TypeError("Cannot merge '%s' with Coordinates" % type(coords)) if len(coords_list) == 0: - return Coordinates([]) + return Coordinates([], crs=None) # check crs crs = coords_list[0].crs @@ -1433,7 +1442,7 @@ def merge_dims(coords_list): # merge coords = sum([list(coords.values()) for coords in coords_list], []) - return Coordinates(coords, crs=crs) + return 
Coordinates(coords, crs=crs, validate_crs=False) def concat(coords_list): @@ -1461,7 +1470,7 @@ def concat(coords_list): raise TypeError("Cannot concat '%s' with Coordinates" % type(coords)) if not coords_list: - return Coordinates([]) + return Coordinates([], crs=None) # check crs crs = coords_list[0].crs @@ -1483,7 +1492,7 @@ def concat(coords_list): else: d[dim] = [np.concatenate([d[dim][i], s.coordinates]) for i, s in enumerate(c)] - return Coordinates(list(d.values()), dims=list(d.keys()), crs=crs) + return Coordinates(list(d.values()), dims=list(d.keys()), crs=crs, validate_crs=False) def union(coords_list): diff --git a/podpac/core/coordinates/dependent_coordinates.py b/podpac/core/coordinates/dependent_coordinates.py index 439ea03b0..6273eaad3 100644 --- a/podpac/core/coordinates/dependent_coordinates.py +++ b/podpac/core/coordinates/dependent_coordinates.py @@ -226,7 +226,7 @@ def _rep(self, dim, index=None): c = self.coordinates[index] ctype = self.ctypes[index] bounds = np.min(c), np.max(c) - return "%s(%s->%s): Bounds[%f, %f], shape%s, ctype[%s]" % ( + return "%s(%s->%s): Bounds[%s, %s], shape%s, ctype[%s]" % ( self.__class__.__name__, ",".join(self.idims), dim, @@ -600,7 +600,7 @@ def __init__(self, coordinates, name=None, ctype=None, segment_lengths=None): Coordinates1d.__init__(self, name=name, ctype=ctype, segment_lengths=segment_lengths) def __repr__(self): - return "%s(%s): Bounds[%f, %f], shape%s, ctype['%s']" % ( + return "%s(%s): Bounds[%s, %s], shape%s, ctype['%s']" % ( self.__class__.__name__, self.name or "?", self.bounds[0], diff --git a/podpac/core/coordinates/test/test_coordinates.py b/podpac/core/coordinates/test/test_coordinates.py index 673f91059..a15aea0c9 100644 --- a/podpac/core/coordinates/test/test_coordinates.py +++ b/podpac/core/coordinates/test/test_coordinates.py @@ -1420,7 +1420,7 @@ def test_intersect_dims(self): assert c2["lon"] == c["lon"][2:5] def test_intersect_crs(self): - # should change the other coordinates crs into the native coordinates crs for intersect + # should change the other coordinates crs into the coordinates crs for intersect c = Coordinates( [np.linspace(0, 10, 11), np.linspace(0, 10, 11), ["2018-01-01", "2018-01-02"]], dims=["lat", "lon", "time"] ) diff --git a/podpac/core/coordinates/utils.py b/podpac/core/coordinates/utils.py index 0a005d628..0f7beada8 100644 --- a/podpac/core/coordinates/utils.py +++ b/podpac/core/coordinates/utils.py @@ -473,7 +473,7 @@ def lower_precision_time_bounds(my_bounds, other_bounds, outer): Parameters ----------- my_bounds : List(np.datetime64) - The bounds of the native coordinates of the dataset + The bounds of the coordinates of the dataset other_bounds : List(np.datetime64) The bounds used for the selection outer : bool @@ -482,7 +482,7 @@ def lower_precision_time_bounds(my_bounds, other_bounds, outer): Returns -------- my_bounds : List(np.datetime64) - The bounds of the native coordinates of the dataset at the new precision + The bounds of the coordinates of the dataset at the new precision other_bounds : List(np.datetime64) The bounds used for the selection at the new precision, if outer == True, otherwise return original coordinates """ diff --git a/podpac/core/data/array_source.py b/podpac/core/data/array_source.py index fa5c14445..86df77d6e 100644 --- a/podpac/core/data/array_source.py +++ b/podpac/core/data/array_source.py @@ -4,6 +4,7 @@ from __future__ import division, unicode_literals, print_function, absolute_import +import warnings from collections import OrderedDict from six import 
string_types @@ -12,24 +13,25 @@ import pandas as pd # Core dependency of xarray from podpac.core.utils import common_doc, ArrayTrait -from podpac.core.data.datasource import COMMON_DATA_DOC, DataSource from podpac.core.cache import CacheCtrl +from podpac.core.node import NoCacheMixin from podpac.core.coordinates import Coordinates +from podpac.core.data.datasource import COMMON_DATA_DOC, DataSource -class Array(DataSource): +class Array(NoCacheMixin, DataSource): """Create a DataSource from an array -- this node is mostly meant for small experiments Attributes ---------- source : np.ndarray Numpy array containing the source data - native_coordinates : podpac.Coordinates + coordinates : podpac.Coordinates The coordinates of the source data Notes ------ - `native_coordinates` need to supplied by the user when instantiating this node. + `coordinates` needs to be supplied by the user when instantiating this node. This Node is not meant for large arrays, and cause issues with caching. As such, this Node override the default cache behavior as having no cache -- its data is in RAM already and caching is not helpful. @@ -43,41 +45,50 @@ class Array(DataSource): >>> coords = podpac.Coordinates([podpac.clinspace(1, 10, 10, 'time'), podpac.clinspace(1, 32, 32, 'lat'), podpac.clinspace(1, 34, 34, 'lon')]) - >>> node = podpac.data.Array(source=data, native_coordinates=coords, outputs=['R', 'G', 'B']) + >>> node = podpac.data.Array(source=data, coordinates=coords, outputs=['R', 'G', 'B']) >>> output = node.eval(coords) """ - source = ArrayTrait().tag(readonly=True) - native_coordinates = tl.Instance(Coordinates, allow_none=False).tag(attr=True) + source = ArrayTrait().tag(attr=True) + coordinates = tl.Instance(Coordinates).tag(attr=True) - @tl.default("cache_ctrl") - def _cache_ctrl_default(self): - return CacheCtrl([]) + _repr_keys = ["shape", "interpolation"] @tl.validate("source") def _validate_source(self, d): - a = d["value"] try: - a.astype(float) + d["value"].astype(float) except: - raise ValueError("Array source must be numerical") - return a + raise ValueError("Array 'source' data must be numerical") + return d["value"] def _first_init(self, **kwargs): - # If Array is being created from Node.from_definition or Node.from_json, then we have to handle the - # native coordinates specifically. This is special. No other DataSource node needs to deserialize - # native_coordinates in this way because it is implemented specifically in the node through get_coordinates - if isinstance(kwargs.get("native_coordinates"), OrderedDict): - kwargs["native_coordinates"] = Coordinates.from_definition(kwargs["native_coordinates"]) - elif isinstance(kwargs.get("native_coordinates"), string_types): - kwargs["native_coordinates"] = Coordinates.from_json(kwargs["native_coordinates"]) + # If the coordinates were supplied explicitly, they may need to be deserialized. 
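+        # (This typically happens when the node is built via Node.from_definition or Node.from_json,
+        # where "coordinates" arrives as a definition OrderedDict or a JSON string rather than a
+        # Coordinates object.)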
+ if isinstance(kwargs.get("coordinates"), OrderedDict): + kwargs["coordinates"] = Coordinates.from_definition(kwargs["coordinates"]) + elif isinstance(kwargs.get("coordinates"), string_types): + kwargs["coordinates"] = Coordinates.from_json(kwargs["coordinates"]) return kwargs + @property + def shape(self): + """Returns the shape of :attr:`self.source` + + Returns + ------- + tuple + Shape of :attr:`self.source` + """ + return self.source.shape + @common_doc(COMMON_DATA_DOC) def get_data(self, coordinates, coordinates_index): """{get_data} """ - s = coordinates_index - d = self.create_output_array(coordinates, data=self.source[s]) + d = self.create_output_array(coordinates, data=self.source[coordinates_index]) return d + + def set_coordinates(self, value): + """ Not needed. """ + pass diff --git a/podpac/core/data/csv_source.py b/podpac/core/data/csv_source.py new file mode 100644 index 000000000..57675d006 --- /dev/null +++ b/podpac/core/data/csv_source.py @@ -0,0 +1,152 @@ +import pandas as pd +import traitlets as tl + +from podpac.core.utils import common_doc, cached_property +from podpac.core.coordinates import Coordinates, StackedCoordinates +from podpac.core.data.datasource import COMMON_DATA_DOC, DATA_DOC +from podpac.core.data.file_source import BaseFileSource, FileKeysMixin, LoadFileMixin + + +@common_doc(COMMON_DATA_DOC) +class CSV(FileKeysMixin, LoadFileMixin, BaseFileSource): + """Create a DataSource from a .csv file. + + This class assumes that the data has a storage format such as: + header 1, header 2, header 3, ... + row1_data1, row1_data2, row1_data3, ... + row2_data1, row2_data2, row2_data3, ... + + Attributes + ---------- + source : str + Path to the csv file + header : int, None + Row number containing the column names, default 0. Use None for no header. 
+ dataset : pd.DataFrame + Raw Pandas DataFrame used to read the data + coordinates : Coordinates + {coordinates} + data_key : str, int + data column number or column title, default 'data' + lat_key : str, int + latitude column number or column title, default 'lat' + lon_key : str, int + longitude column number or column title, default 'lon' + time_key : str, int + time column number or column title, default 'time' + alt_key : str, int + altitude column number or column title, default 'alt' + crs : str + Coordinate reference system of the coordinates + """ + + header = tl.Any(default_value=0).tag(attr=True) + lat_key = tl.Union([tl.Unicode(), tl.Int()], default_value="lat").tag(attr=True) + lon_key = tl.Union([tl.Unicode(), tl.Int()], default_value="lon").tag(attr=True) + time_key = tl.Union([tl.Unicode(), tl.Int()], default_value="time").tag(attr=True) + alt_key = tl.Union([tl.Unicode(), tl.Int()], default_value="alt").tag(attr=True) + data_key = tl.Union([tl.Unicode(), tl.Int(), tl.List(trait=tl.Unicode()), tl.List(trait=tl.Int())]).tag(attr=True) + + @tl.default("data_key") + def _default_data_key(self): + return super(CSV, self)._default_data_key() + + @tl.validate("data_key") + def _validate_data_key(self, d): + keys = d["value"] + if not isinstance(keys, list): + keys = [d["value"]] + + if isinstance(keys[0], int): + for col in keys: + if col not in self.available_data_cols: + raise ValueError("Invalid data_key %d, available columns are %s" % (col, self.available_data_cols)) + else: + for key in keys: + if key not in self.available_data_keys: + raise ValueError("Invalid data_key '%s', available keys are %s" % (key, self.available_data_keys)) + + return d["value"] + + @tl.default("outputs") + def _default_outputs(self): + if not isinstance(self.data_key, list): + return None + else: + return [self._get_key(elem) for elem in self.data_key] + + # ------------------------------------------------------------------------- + # public api methods + # ------------------------------------------------------------------------- + + def open_dataset(self, f): + return pd.read_csv(f, parse_dates=True, infer_datetime_format=True, header=self.header) + + @cached_property + def dims(self): + """ list of dataset coordinate dimensions """ + lookup = { + self._get_key(self.lat_key): "lat", + self._get_key(self.lon_key): "lon", + self._get_key(self.alt_key): "alt", + self._get_key(self.time_key): "time", + } + return [lookup[key] for key in self.dataset.columns if key in lookup] + + @cached_property + def keys(self): + """available data keys""" + return self.dataset.columns.tolist() + + @cached_property + def available_data_keys(self): + """available data keys""" + + dim_keys = [self._get_key(key) for key in [self.lat_key, self.lon_key, self.alt_key, self.time_key]] + keys = [key for key in self.keys if key not in dim_keys] + if len(keys) == 0: + raise ValueError("No data keys found in '%s'" % self.source) + return keys + + @cached_property + def available_data_cols(self): + return [self._get_col(key) for key in self.available_data_keys] + + @common_doc(COMMON_DATA_DOC) + def get_coordinates(self): + """{get_coordinates} + + Note: CSV files have StackedCoordinates. 
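+
+        For example (illustrative), a csv file with 'lat', 'lon', 'time', and 'data' columns produces
+        Coordinates with a single stacked 'lat_lon_time' dimension, because each row of the file is a
+        single point.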
+ """ + + coords = super(CSV, self).get_coordinates() + if len(coords) == 1: + return coords + stacked = StackedCoordinates(list(coords.values())) + return Coordinates([stacked], validate_crs=False, **coords.properties) + + @common_doc(COMMON_DATA_DOC) + def get_data(self, coordinates, coordinates_index): + """{get_data} + """ + + if not isinstance(self.data_key, list): + I = self._get_col(self.data_key) + else: + I = [self._get_col(key) for key in self.data_key] + data = self.dataset.iloc[coordinates_index[0], I] + return self.create_output_array(coordinates, data=data) + + # ------------------------------------------------------------------------- + # helper methods + # ------------------------------------------------------------------------- + + def _lookup_key(self, dim): + lookup = {"lat": self.lat_key, "lon": self.lon_key, "alt": self.alt_key, "time": self.time_key} + return self._get_key(lookup[dim]) + + def _get_key(self, key): + return self.dataset.columns[key] if isinstance(key, int) else key + + def _get_col(self, key): + return key if isinstance(key, int) else self.dataset.columns.get_loc(key) diff --git a/podpac/core/data/dataset_source.py b/podpac/core/data/dataset_source.py new file mode 100644 index 000000000..51285f62a --- /dev/null +++ b/podpac/core/data/dataset_source.py @@ -0,0 +1,79 @@ +import xarray as xr +import traitlets as tl + +from podpac.core.utils import common_doc, cached_property +from podpac.core.data.datasource import COMMON_DATA_DOC, DATA_DOC +from podpac.core.data.file_source import BaseFileSource, FileKeysMixin, LoadFileMixin + + +@common_doc(COMMON_DATA_DOC) +class Dataset(FileKeysMixin, LoadFileMixin, BaseFileSource): + """Create a DataSource node using xarray.open_dataset. + + Attributes + ---------- + source : str + Path to the dataset file. + In addition to local paths, file://, http://, ftp://, and s3:// transport protocols are supported. + dataset : xarray.Dataset + Dataset object. + coordinates : Coordinates + {coordinates} + data_key : str + data key, default 'data' + lat_key : str + latitude key, default 'lat' + lon_key : str + longitude key, default 'lon' + time_key : str + time key, default 'time' + alt_key : str + altitude key, default 'alt' + crs : str + Coordinate reference system of the coordinates + extra_dim : dict + In cases where the data contain dimensions other than ['lat', 'lon', 'time', 'alt'], these dimensions need to be selected. 
+ For example, if the data contains ['lat', 'lon', 'channel'], the second channel can be selected using `extra_dim=dict(channel=1)` + """ + + # dataset = tl.Instance(xr.Dataset).tag(readonly=True) + extra_dim = tl.Dict(allow_none=True).tag(attr=True) + + @tl.default("extra_dim") + def _default_extra_dim(self): + return None + + # ------------------------------------------------------------------------- + # public api properties and methods + # ------------------------------------------------------------------------- + + def open_dataset(self, fp): + return xr.open_dataset(fp) + + def close_dataset(self): + self.dataset.close() + + @cached_property + def dims(self): + """dataset coordinate dims""" + lookup = {self.lat_key: "lat", self.lon_key: "lon", self.alt_key: "alt", self.time_key: "time"} + return [lookup[dim] for dim in self.dataset.dims] + + @cached_property + def keys(self): + return list(self.dataset.keys()) + + @common_doc(COMMON_DATA_DOC) + def get_data(self, coordinates, coordinates_index): + """{get_data} + """ + + if not isinstance(self.data_key, list): + data = self.dataset[self.data_key] + data = data.transpose(*self.dataset.dims) + else: + data = self.dataset[self.data_key].to_array(dim="output") + tdims = tuple(self.dataset.dims) + ("output",) + data = data.transpose(*tdims) + + return self.create_output_array(coordinates, data.data[coordinates_index]) diff --git a/podpac/core/data/datasource.py b/podpac/core/data/datasource.py index 0d4f91eef..d7ebee9f7 100644 --- a/podpac/core/data/datasource.py +++ b/podpac/core/data/datasource.py @@ -10,7 +10,6 @@ from copy import deepcopy import warnings import logging -from six import string_types import numpy as np import xarray as xr @@ -21,15 +20,15 @@ from podpac.core.units import UnitsDataArray from podpac.core.coordinates import Coordinates, Coordinates1d, StackedCoordinates from podpac.core.node import Node, NodeException -from podpac.core.utils import common_doc, trait_is_defined +from podpac.core.utils import common_doc from podpac.core.node import COMMON_NODE_DOC from podpac.core.node import node_eval -from podpac.core.data.interpolation import Interpolation, interpolation_trait +from podpac.core.interpolation.interpolation import Interpolation, InterpolationTrait log = logging.getLogger(__name__) DATA_DOC = { - "native_coordinates": "The coordinates of the data source.", + "coordinates": "The coordinates of the data source.", "get_data": """ This method must be defined by the data source implementing the DataSource class. When data source nodes are evaluated, this method is called with request coordinates and coordinate indexes. @@ -65,12 +64,12 @@ the data will be cast into UnitsDataArray using the returned data to fill values at the requested source coordinates. """, - "get_native_coordinates": """ - Returns a Coordinates object that describes the native coordinates of the data source. + "get_coordinates": """ + Returns a Coordinates object that describes the coordinates of the data source. In most cases, this method is defined by the data source implementing the DataSource class. - If method is not implemented by the data source, it will try to return ``self.native_coordinates`` - if ``self.native_coordinates`` is not None. + If method is not implemented by the data source, it will try to return ``self.coordinates`` + if ``self.coordinates`` is not None. Otherwise, this method will raise a NotImplementedError. @@ -134,8 +133,8 @@ class DataSource(Node): source : Any The location of the source. 
Depending on the child node this can be a filepath, numpy array, or dictionary as a few examples. - native_coordinates : :class:`podpac.Coordinates` - {native_coordinates} + coordinates : :class:`podpac.Coordinates` + {coordinates} interpolation : str, dict, optional {interpolation_long} nan_vals : List, optional @@ -143,21 +142,28 @@ class DataSource(Node): coordinate_index_type : str, optional Type of index to use for data source. Possible values are ``['list', 'numpy', 'xarray', 'pandas']`` Default is 'numpy' - + cache_coordinates : bool + Whether to cache coordinates using the podpac ``cache_ctrl``. Default False. + cache_output : bool + Should the node's output be cached? If not provided or None, uses default based on + settings["CACHE_DATASOURCE_OUTPUT_DEFAULT"]. If True, outputs will be cached and retrieved from cache. If False, + outputs will not be cached OR retrieved from cache (even if they exist in cache). Notes ----- - Custom DataSource Nodes must implement the :meth:`get_data` and :meth:`get_native_coordinates` methods. + Custom DataSource Nodes must implement the :meth:`get_data` and :meth:`get_coordinates` methods. """ - source = tl.Any().tag(readonly=True) - native_coordinates = tl.Instance(Coordinates).tag(readonly=True) - interpolation = interpolation_trait() + interpolation = InterpolationTrait().tag(attr=True) + nan_vals = tl.List().tag(attr=True) + coordinate_index_type = tl.Enum(["slice", "list", "numpy"], default_value="numpy") # , "xarray", "pandas"], - nan_vals = tl.List(allow_none=True).tag(attr=True) + cache_coordinates = tl.Bool(False) + cache_output = tl.Bool() # privates _interpolation = tl.Instance(Interpolation) + _coordinates = tl.Instance(Coordinates, allow_none=True, default_value=None, read_only=True) _original_requested_coordinates = tl.Instance(Coordinates, allow_none=True) _requested_source_coordinates = tl.Instance(Coordinates) @@ -165,17 +171,16 @@ class DataSource(Node): _requested_source_data = tl.Instance(UnitsDataArray) _evaluated_coordinates = tl.Instance(Coordinates) - # when native_coordinates is not defined, default calls get_native_coordinates - @tl.default("native_coordinates") - def _default_native_coordinates(self): - return self.get_native_coordinates() - # this adds a more helpful error message if user happens to try an inspect _interpolation before evaluate @tl.default("_interpolation") def _default_interpolation(self): self._set_interpolation() return self._interpolation + @tl.default("cache_output") + def _cache_output_default(self): + return settings["CACHE_DATASOURCE_OUTPUT_DEFAULT"] + # ------------------------------------------------------------------------------------------------------------------ # Properties # ------------------------------------------------------------------------------------------------------------------ @@ -212,6 +217,22 @@ def interpolators(self): else: return OrderedDict() + @property + def coordinates(self): + """{coordinates}""" + + if self._coordinates is not None: + nc = self._coordinates + elif self.cache_coordinates and self.has_cache("coordinates"): + nc = self.get_cache("coordinates") + self.set_trait("_coordinates", nc) + else: + nc = self.get_coordinates() + self.set_trait("_coordinates", nc) + if self.cache_coordinates: + self.put_cache(nc, "coordinates") + return nc + # ------------------------------------------------------------------------------------------------------------------ # Private Methods # 
------------------------------------------------------------------------------------------------------------------ @@ -267,9 +288,8 @@ def _get_data(self): udata_array = udata_array.sel(output=self.output) # fill nan_vals in data array - if self.nan_vals: - for nan_val in self.nan_vals: - udata_array.data[udata_array.data == nan_val] = np.nan + for nan_val in self.nan_vals: + udata_array.data[udata_array.data == nan_val] = np.nan return udata_array @@ -282,10 +302,10 @@ def _get_data(self): def eval(self, coordinates, output=None): """Evaluates this node using the supplied coordinates. - The native coordinates are mapped to the requested coordinates, interpolated if necessary, and set to + The source coordinates are mapped to the requested coordinates, interpolated if necessary, and set to `_requested_source_coordinates` with associated index `_requested_source_coordinates_index`. The requested source coordinates and index are passed to `get_data()` returning the source data at the - native coordinatesset to `_requested_source_data`. Finally `_requested_source_data` is interpolated + coordinates, which is set to `_requested_source_data`. Finally `_requested_source_data` is interpolated using the `interpolate` method and set to the `output` attribute of the node. @@ -323,7 +343,7 @@ def eval(self, coordinates, output=None): self._original_requested_coordinates = coordinates # check for missing dimensions - for c in self.native_coordinates.values(): + for c in self.coordinates.values(): if isinstance(c, Coordinates1d): if c.name not in coordinates.udims: raise ValueError("Cannot evaluate these coordinates, missing dim '%s'" % c.name) @@ -335,10 +355,10 @@ def eval(self, coordinates, output=None): extra = [] for c in coordinates.values(): if isinstance(c, Coordinates1d): - if c.name not in self.native_coordinates.udims: + if c.name not in self.coordinates.udims: extra.append(c.name) elif isinstance(c, StackedCoordinates): - if all(dim not in self.native_coordinates.udims for dim in c.dims): + if all(dim not in self.coordinates.udims for dim in c.dims): extra.append(c.name) coordinates = coordinates.drop(extra) @@ -346,18 +366,17 @@ def eval(self, coordinates, output=None): self._evaluated_coordinates = deepcopy(coordinates) # transform coordinates into native crs if different - if self.native_coordinates.crs.lower() != coordinates.crs.lower(): - coordinates = coordinates.transform(self.native_coordinates.crs) + if self.coordinates.crs.lower() != coordinates.crs.lower(): + coordinates = coordinates.transform(self.coordinates.crs) - # intersect the native coordinates with requested coordinates - # to get native coordinates within requested coordinates bounds + # intersect the source coordinates with the requested coordinates + # to get source coordinates within the requested coordinates bounds # TODO: support coordinate_index_type parameter to define other index types - ( - self._requested_source_coordinates, - self._requested_source_coordinates_index, - ) = self.native_coordinates.intersect(coordinates, outer=True, return_indices=True) + (self._requested_source_coordinates, self._requested_source_coordinates_index) = self.coordinates.intersect( + coordinates, outer=True, return_indices=True + ) - # if requested coordinates and native coordinates do not intersect, shortcut with nan UnitsDataArary + # if the requested coordinates and the source coordinates do not intersect, shortcut with a nan UnitsDataArray if self._requested_source_coordinates.size == 0: if output is None: output = self.create_output_array(self._evaluated_coordinates) @@ -404,7 
+423,7 @@ def eval(self, coordinates, output=None): # if not provided, create output using the evaluated coordinates, or # if provided, set the order of coordinates to match the output dims - # Note that at this point the coordinates are in the same CRS as the native_coordinates + # Note that at this point the requested coordinates are in the same CRS as the source coordinates if output is None: requested_dims = None output_dims = None @@ -436,7 +455,7 @@ def eval(self, coordinates, output=None): o = o.transpose(*output_dims) o.data[:] = output.transpose(*output_dims).data - # if requested crs is differented than native coordinates, + # if the requested crs is different from the source coordinates crs, # fabricate a new output with the original coordinates and new values if self._evaluated_coordinates.crs != coordinates.crs: output = self.create_output_array(self._evaluated_coordinates, data=output[:].values) @@ -449,15 +468,15 @@ def eval(self, coordinates, output=None): def find_coordinates(self): """ - Get the available native coordinates for the Node. For a DataSource, this is just the native_coordinates. + Get the available coordinates for the Node. For a DataSource, this is just the coordinates. Returns ------- coords_list : list - singleton list containing the native_coordinates (Coordinates object) + singleton list containing the coordinates (Coordinates object) """ - return [self.native_coordinates] + return [self.coordinates] @common_doc(COMMON_DATA_DOC) def get_data(self, coordinates, coordinates_index): @@ -471,79 +490,26 @@ def get_data(self, coordinates, coordinates_index): raise NotImplementedError @common_doc(COMMON_DATA_DOC) - def get_native_coordinates(self): - """{get_native_coordinates} + def get_coordinates(self): + """{get_coordinates} Raises - -------- + ------ NotImplementedError - Raised if get_native_coordinates is not implemented by data source subclass. - """ - - if trait_is_defined(self, "native_coordinates"): - return self.native_coordinates - else: - raise NotImplementedError( - "{0}.native_coordinates is not defined and " - "{0}.get_native_coordinates() is not implemented".format(self.__class__.__name__) - ) - - @property - @common_doc(COMMON_DATA_DOC) - def base_definition(self): - """Base node definition for DataSource nodes. - - Returns - ------- - {definition_return} + This needs to be implemented by derived classes """ + raise NotImplementedError - d = super(DataSource, self).base_definition - - # check attrs and remove unnecesary attrs - attrs = d.get("attrs", {}) - if "source" in attrs: - raise NodeException("The 'source' property cannot be tagged as an 'attr'") - if "interpolation" in attrs: - raise NodeException("The 'interpolation' property cannot be tagged as an 'attr'") - if "nan_vals" in attrs and not self.nan_vals: - del attrs["nan_vals"] - - # set source or lookup_source - if isinstance(self.source, Node): - d["lookup_source"] = self.source - elif isinstance(self.source, np.ndarray): - d["source"] = self.source.tolist() - else: - d["source"] = self.source + def set_coordinates(self, coordinates, force=False): + """ Set the coordinates. Used by Compositors as an optimization. - # assign the interpolation definition - d["interpolation"] = self.interpolation + Arguments + --------- + coordinates : :class:`podpac.Coordinates` + Coordinates to set. Usually these are coordinates that are shared across compositor sources. - return d + NOTE: This is only currently used by SMAPCompositor. It should potentially be moved to the SMAPSource. 
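+
+        For example, a compositor might push a shared set of coordinates onto its sources along these
+        lines (an illustrative sketch only; ``shared_coordinates`` is a hypothetical attribute)::
+
+            for source in self.sources:
+                source.set_coordinates(self.shared_coordinates)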
+ """ - # ------------------------------------------------------------------------------------------------------------------ - # Operators/Magic Methods - # ------------------------------------------------------------------------------------------------------------------ - def __repr__(self): - source_name = str(self.__class__.__name__) - - rep = "{}".format(source_name) - if source_name != "DataSource": - rep += " DataSource" - - source_disp = self.source if isinstance(self.source, string_types) else "\n{}".format(self.source) - rep += "\n\tsource: {}".format(source_disp) - if trait_is_defined(self, "native_coordinates"): - rep += "\n\tnative_coordinates: " - for c in self.native_coordinates.values(): - if isinstance(c, Coordinates1d): - rep += "\n\t\t%s: %s" % (c.name, c) - elif isinstance(c, StackedCoordinates): - for _c in c: - rep += "\n\t\t%s[%s]: %s" % (c.name, _c.name, _c) - - # rep += '{}: {}'.format(c.name, c) - rep += "\n\tinterpolation: {}".format(self.interpolation) - - return rep + if not self.trait_is_defined("_coordinates"): + self.set_trait("_coordinates", coordinates) diff --git a/podpac/core/data/file.py b/podpac/core/data/file.py deleted file mode 100644 index 25995b044..000000000 --- a/podpac/core/data/file.py +++ /dev/null @@ -1,856 +0,0 @@ -""" -Datasources from files -""" - -from __future__ import division, unicode_literals, print_function, absolute_import - -from io import BytesIO -from collections import OrderedDict -from six import string_types - -import numpy as np -import traitlets as tl -import pandas as pd -import xarray as xr -import pyproj -import logging - -from podpac.core.settings import settings -from podpac.core.utils import common_doc, trait_is_defined -from podpac.core.data.datasource import COMMON_DATA_DOC, DataSource -from podpac.core.coordinates import Coordinates, UniformCoordinates1d, ArrayCoordinates1d, StackedCoordinates -from podpac.core.coordinates import RotatedCoordinates -from podpac.core.coordinates.utils import Dimension, VALID_DIMENSION_NAMES - -# Optional dependencies -from lazy_import import lazy_module, lazy_class - -rasterio = lazy_module("rasterio") -h5py = lazy_module("h5py") -boto3 = lazy_module("boto3") -requests = lazy_module("requests") -zarr = lazy_module("zarr") -zarrGroup = lazy_class("zarr.Group") -s3fs = lazy_module("s3fs") - -# Set up logging -_logger = logging.getLogger(__name__) - - -@common_doc(COMMON_DATA_DOC) -class DatasetSource(DataSource): - """ - Base class for dataset/file datasources. - - This class facilitates setting the native_coordinates from the coordinates defined in the file, including - decoding datetimes when necessary. The coordinates are automatically read from the dataset when possible, and - methods are provided for customization when necessary. - - Attributes - ---------- - source : str - Path to the data source file. - dataset - Dataset object. - native_coordinates : Coordinates - {native_coordinates} - lat_key : str - latitude key, default 'lat' - lon_key : str - longitude key, default 'lon' - time_key : str - time key, default 'time' - alt_key : str - altitude key, default 'alt' - data_key : str - data key - output_keys : list - list of data keys, for multiple-output nodes - crs : str - Coordinate reference system of the coordinates. 
- cf_time : bool - decode CF datetimes - cf_units : str - units, when decoding CF datetimes - cf_calendar : str - calendar, when decoding CF datetimes - """ - - source = tl.Unicode(default_value=None, allow_none=True).tag(readonly=True) - data_key = tl.Unicode(allow_none=True).tag(attr=True) - output_keys = tl.List(allow_none=True).tag(attr=True) - lat_key = tl.Unicode(allow_none=True, default_value="lat").tag(attr=True) - lon_key = tl.Unicode(allow_none=True, default_value="lon").tag(attr=True) - time_key = tl.Unicode(allow_none=True, default_value="time").tag(attr=True) - alt_key = tl.Unicode(allow_none=True, default_value="alt").tag(attr=True) - crs = tl.Unicode(allow_none=True, default_value=None).tag(attr=True) - cf_time = tl.Bool(False).tag(attr=True) - cf_units = tl.Unicode(allow_none=True).tag(attr=True) - cf_calendar = tl.Unicode(allow_none=True).tag(attr=True) - - dataset = tl.Any().tag(readonly=True) - - @tl.default("data_key") - def _default_data_key(self): - return None - - @tl.default("output_keys") - def _default_output_keys(self): - return None - - @tl.validate("output") - def _validate_output(self, d): - return d["value"] - - def init(self): - super(DatasetSource, self).init() - - # check the dataset and dims - self.dataset - self.dims - - # validation and defaults for data_key, output_keys, outputs, and output - if self.data_key is not None and self.output_keys is not None: - raise TypeError("%s cannot have both 'data_key' or 'output_keys' defined" % self.__class__.__name__) - - if self.data_key is None and self.output_keys is None: - available_keys = self.available_keys - if len(available_keys) == 1: - self.set_trait("data_key", available_keys[0]) - else: - self.set_trait("output_keys", available_keys) - - if self.outputs is not None: - if self.data_key is not None: - raise TypeError("outputs must be None for single-output nodes") - if len(self.outputs) != len(self.output_keys): - raise ValueError( - "outputs and output_keys size mismatch (%d != %d)" % (len(self.outputs), len(self.output_keys)) - ) - else: - self.set_trait("outputs", self.output_keys) - - if self.output is not None: - if self.outputs is None: - raise TypeError("Invalid output '%s' (output must be None for single-output nodes)." % self.output) - if self.output not in self.outputs: - raise ValueError("Invalid output '%s' (available outputs are %s)" % (self.output, self.outputs)) - - @common_doc(COMMON_DATA_DOC) - def get_native_coordinates(self): - """{native_coordinates} - """ - cs = [] - dims = self.dims - for dim in dims: - if dim == "lat": - cs.append(self.get_lat()) - elif dim == "lon": - cs.append(self.get_lon()) - elif dim == "time": - cs.append(self.get_time()) - elif dim == "alt": - cs.append(self.get_alt()) - - return Coordinates(cs, dims=dims, crs=self.crs) - - def get_lat(self): - """Get the native latitude coordinates from the dataset.""" - return self.dataset[self.lat_key] - - def get_lon(self): - """Get the native longitude coordinates from the dataset.""" - return self.dataset[self.lon_key] - - def get_time(self): - """Get the native time coordinates from the dataset, decoding datetimes if requested.""" - values = self.dataset[self.time_key] - if self.cf_time: - values = xr.coding.times.decode_cf_datetime(values, self.cf_units, self.cf_calendar) - return values - - def get_alt(self): - """Get the native altitude coordinates from the dataset.""" - return self.dataset[self.alt_key] - - def close_dataset(self): - """ Close the dataset. Subclasses should implement as needed. 
""" - pass - - @property - def dims(self): - raise NotImplementedError - - @property - def available_keys(self): - raise NotImplementedError - - @property - @common_doc(COMMON_DATA_DOC) - def base_definition(self): - """Base node definition for DatasetSource nodes. - - Returns - ------- - {definition_return} - """ - - d = super(DatasetSource, self).base_definition - - # remove unnecessary attrs - attrs = d.get("attrs", {}) - if self.data_key is None and "data_key" in attrs: - del attrs["data_key"] - if self.output_keys is None and "output_keys" in attrs: - del attrs["output_keys"] - if self.crs is None and "crs" in attrs: - del attrs["crs"] - if self.outputs == self.output_keys and "outputs" in attrs: - del attrs["outputs"] - if "lat" not in self.dims and "lat_key" in attrs: - del attrs["lat_key"] - if "lon" not in self.dims and "lon_key" in attrs: - del attrs["lon_key"] - if "alt" not in self.dims and "alt_key" in attrs: - del attrs["alt_key"] - if "time" not in self.dims and "time_key" in attrs: - del attrs["time_key"] - if self.cf_time is False: - if "cf_time" in attrs: - del attrs["cf_time"] - if "cf_units" in attrs: - del attrs["cf_units"] - if "cf_calendar" in attrs: - del attrs["cf_calendar"] - - return d - - -@common_doc(COMMON_DATA_DOC) -class Dataset(DatasetSource): - """Create a DataSource node using xarray.open_dataset. - - Attributes - ---------- - source : str - Path to the dataset file. - dataset : xarray.Dataset - Dataset object. - native_coordinates : Coordinates - {native_coordinates} - data_key : str - data key, default 'data' - lat_key : str - latitude key, default 'lat' - lon_key : str - longitude key, default 'lon' - time_key : str - time key, default 'time' - alt_key : str - altitude key, default 'alt' - crs : str - Coordinate reference system of the coordinates - extra_dim : dict - In cases where the data contain dimensions other than ['lat', 'lon', 'time', 'alt'], these dimensions need to be selected. - For example, if the data contains ['lat', 'lon', 'channel'], the second channel can be selected using `extra_dim=dict(channel=1)` - """ - - dataset = tl.Instance(xr.Dataset).tag(readonly=True) - - # node attrs - extra_dim = tl.Dict(allow_none=True, default_value=None).tag(attr=True) - - @tl.default("dataset") - def _open_dataset(self): - return xr.open_dataset(self.source) - - def close_dataset(self): - self.dataset.close() - - @common_doc(COMMON_DATA_DOC) - def get_data(self, coordinates, coordinates_index): - """{get_data} - """ - if self.data_key is not None: - data = self.dataset[self.data_key] - data = data.transpose(*self.dataset.dims) - else: - data = self.dataset[self.output_keys].to_array(dim="output") - tdims = tuple(self.dataset.dims) + ("output",) - data = data.transpose(*tdims) - return self.create_output_array(coordinates, data.data[coordinates_index]) - - @property - def dims(self): - """dataset coordinate dims""" - lookup = {self.lat_key: "lat", self.lon_key: "lon", self.alt_key: "alt", self.time_key: "time"} - for dim in self.dataset.dims: - if dim not in lookup: - raise ValueError( - "Unexpected dimension '%s' in xarray dataset (source '%s'). " - "Use 'lat_key', 'lon_key', 'time_key' and 'alt_key' to select dataset dimensions" - % (dim, self.source) - ) - - return [lookup[dim] for dim in self.dataset.dims] - - @property - def available_keys(self): - """available data keys""" - return list(self.dataset.keys()) - - @property - @common_doc(COMMON_DATA_DOC) - def base_definition(self): - """Base node definition for DatasetSource nodes. 
- - Returns - ------- - {definition_return} - """ - - d = super(Dataset, self).base_definition - - # remove unnecessary attrs - attrs = d.get("attrs", {}) - if self.extra_dim is None and "extra_dim" in attrs: - del attrs["extra_dim"] - - return d - - -@common_doc(COMMON_DATA_DOC) -class CSV(DatasetSource): - """Create a DataSource from a .csv file. - - This class assumes that the data has a storage format such as: - header 1, header 2, header 3, ... - row1_data1, row1_data2, row1_data3, ... - row2_data1, row2_data2, row2_data3, ... - - Attributes - ---------- - source : str - Path to the csv file - header : int, None - Row number containing the column names, default 0. Use None for no header. - dataset : pd.DataFrame - Raw Pandas DataFrame used to read the data - native_coordinates : Coordinates - {native_coordinates} - data_key : str, int - data column number or column title, default 'data' - lat_key : str, int - latitude column number or column title, default 'lat' - lon_key : str, int - longitude column number or column title, default 'lon' - time_key : str, int - time column number or column title, default 'time' - alt_key : str, int - altitude column number or column title, default 'alt' - crs : str - Coordinate reference system of the coordinates - """ - - header = tl.Any(default_value=0).tag(attr=True) - lat_key = tl.Union([tl.Unicode(), tl.Int()], default_value="lat").tag(attr=True) - lon_key = tl.Union([tl.Unicode(), tl.Int()], default_value="lon").tag(attr=True) - time_key = tl.Union([tl.Unicode(), tl.Int()], default_value="time").tag(attr=True) - alt_key = tl.Union([tl.Unicode(), tl.Int()], default_value="alt").tag(attr=True) - data_key = tl.Union([tl.Unicode(), tl.Int()], allow_none=True, default_value=None).tag(attr=True) - output_keys = tl.Union([tl.List(tl.Unicode()), tl.List(tl.Int())], allow_none=True, default_value=None).tag( - attr=True - ) - - dataset = tl.Instance(pd.DataFrame).tag(readonly=True) - - @tl.default("dataset") - def _open_dataset(self): - return pd.read_csv(self.source, parse_dates=True, infer_datetime_format=True, header=self.header) - - def _get_key(self, key): - return self.dataset.columns[key] if isinstance(key, int) else key - - def _get_col(self, key): - return key if isinstance(key, int) else self.dataset.columns.get_loc(key) - - def get_lat(self): - """Get latitude coordinates from the csv file.""" - return self.dataset[self._get_key(self.lat_key)].values - - def get_lon(self): - """Get longitude coordinates from the csv file.""" - return self.dataset[self._get_key(self.lon_key)].values - - def get_time(self): - """Get time coordinates from the csv file.""" - return self.dataset[self._get_key(self.time_key)].values - - def get_alt(self): - """Get altitude coordinates from the csv file.""" - return self.dataset[self._get_key(self.alt_key)].values - - @common_doc(COMMON_DATA_DOC) - def get_native_coordinates(self): - """{get_native_coordinates} - - Note: CSV files have StackedCoordinates. 
- """ - - coords = super(CSV, self).get_native_coordinates() - if len(coords) == 1: - return coords - stacked = StackedCoordinates(list(coords.values())) - return Coordinates([stacked], **coords.properties) - - @common_doc(COMMON_DATA_DOC) - def get_data(self, coordinates, coordinates_index): - """{get_data} - """ - if self.data_key is not None: - I = self._get_col(self.data_key) - else: - I = [self._get_col(key) for key in self.output_keys] - data = self.dataset.iloc[coordinates_index[0], I] - return self.create_output_array(coordinates, data=data) - - @property - def dims(self): - """dataset coordinate dims""" - lookup = { - self._get_key(self.lat_key): "lat", - self._get_key(self.lon_key): "lon", - self._get_key(self.alt_key): "alt", - self._get_key(self.time_key): "time", - } - return [lookup[key] for key in self.dataset.columns if key in lookup] - - @property - def available_keys(self): - """available data keys""" - dims_keys = [self.lat_key, self.lon_key, self.alt_key, self.time_key] - return [key for key in self.dataset.columns if key not in dims_keys] - - @property - @common_doc(COMMON_DATA_DOC) - def base_definition(self): - """Base node definition for DatasetSource nodes. - - Returns - ------- - {definition_return} - """ - - d = super(CSV, self).base_definition - - # remove unnecessary attrs - attrs = d.get("attrs", {}) - if self.header == 0 and "header" in attrs: - del attrs["header"] - - return d - - -@common_doc(COMMON_DATA_DOC) -class H5PY(DatasetSource): - """Create a DataSource node using h5py. - - Attributes - ---------- - source : str - Path to the h5py file - dataset : h5py.File - The h5py file object used to read the file - native_coordinates : Coordinates - {native_coordinates} - file_mode : str, optional - Default is 'r'. The mode used to open the HDF5 file. Options are r, r+, w, w- or x, a (see h5py.File). - data_key : str, int - data key, default 'data' - lat_key : str, int - latitude coordinates key, default 'lat' - lon_key : str, int - longitude coordinates key, default 'lon' - time_key : str, int - time coordinates key, default 'time' - alt_key : str, int - altitude coordinates key, default 'alt' - crs : str - Coordinate reference system of the coordinates - cf_time : bool - decode CF datetimes - cf_units : str - units, when decoding CF datetimes - cf_calendar : str - calendar, when decoding CF datetimes - """ - - file_mode = tl.Unicode(default_value="r").tag(readonly=True) - - dims = tl.List(allow_none=False) - - @tl.default("dims") - def _dims_default(self): - """dataset coordinate dims""" - key = self.data_key - if key is None: - key = self.available_keys[0] - try: - dims = self.dataset[key].attrs["_ARRAY_DIMENSIONS"] - except: - lookup = {self.lat_key: "lat", self.lon_key: "lon", self.alt_key: "alt", self.time_key: "time"} - dims = [lookup[key] for key in H5PY._find_h5py_keys(self.dataset) if key in lookup] - return dims - - @tl.default("dataset") - def _open_dataset(self): - # TODO: dataset should not open by default - # prefer with as: syntax - return h5py.File(self.source, self.file_mode) - - def close_dataset(self): - """Closes the file. 
""" - self.dataset.close() - - @common_doc(COMMON_DATA_DOC) - def get_data(self, coordinates, coordinates_index): - """{get_data} - """ - data = self.create_output_array(coordinates) - if self.data_key is not None: - data[:] = self.dataset[self.data_key][coordinates_index] - else: - for key, name in zip(self.output_keys, self.outputs): - data.sel(output=name)[:] = self.dataset[key][coordinates_index] - return data - - def attrs(self, key="/"): - """Dataset or group key for which attributes will be summarized. - """ - return dict(self.dataset[key].attrs) - - @property - def available_keys(self): - dims_keys = [self.lat_key, self.lon_key, self.alt_key, self.time_key] - return [key for key in H5PY._find_h5py_keys(self.dataset) if key not in dims_keys] - - @staticmethod - def _find_h5py_keys(obj, keys=[]): - if isinstance(obj, (h5py.Group, h5py.File)): - for k in obj.keys(): - keys = H5PY._find_h5py_keys(obj[k], keys) - else: - keys.append(obj.name) - return keys - keys = list(set(keys)) - keys.sort() - return keys - - -class Zarr(DatasetSource): - """Create a DataSource node using zarr. - - Attributes - ---------- - source : str - Path to the Zarr archive - file_mode : str, optional - Default is 'r'. The mode used to open the Zarr archive. Options are r, r+, w, w- or x, a. - dataset : zarr.Group - The h5py file object used to read the file - native_coordinates : Coordinates - {native_coordinates} - data_key : str, int - data key, default 'data' - lat_key : str, int - latitude coordinates key, default 'lat' - lon_key : str, int - longitude coordinates key, default 'lon' - time_key : str, int - time coordinates key, default 'time' - alt_key : str, int - altitude coordinates key, default 'alt' - crs : str - Coordinate reference system of the coordinates - cf_time : bool - decode CF datetimes - cf_units : str - units, when decoding CF datetimes - cf_calendar : str - calendar, when decoding CF datetimes - """ - - file_mode = tl.Unicode(default_value="r").tag(readonly=True) - - # optional inputs - access_key_id = tl.Unicode() - secret_access_key = tl.Unicode() - region_name = tl.Unicode() - dims = tl.List(allow_none=False) - coordinate_index_type = "slice" - - @tl.default("dims") - def _dims_default(self): - """dataset coordinate dims""" - key = self.data_key - if key is None: - key = self.available_keys[0] - try: - dims = self.dataset[key].attrs["_ARRAY_DIMENSIONS"] - except: - lookup = {self.lat_key: "lat", self.lon_key: "lon", self.alt_key: "alt", self.time_key: "time"} - dims = [lookup[key] for key in self.dataset if key in lookup] - return dims - - @tl.default("access_key_id") - def _get_access_key_id(self): - return settings["AWS_ACCESS_KEY_ID"] - - @tl.default("secret_access_key") - def _get_secret_access_key(self): - return settings["AWS_SECRET_ACCESS_KEY"] - - @tl.default("region_name") - def _get_region_name(self): - return settings["AWS_REGION_NAME"] - - @tl.default("dataset") - def _open_dataset(self): - if self.source is None: - raise TypeError("Zarr node requires 'source' or 'dataset'") - - if self.source.startswith("s3://"): - root = self.source.strip("s3://") - kwargs = {"region_name": self.region_name} - s3 = s3fs.S3FileSystem(key=self.access_key_id, secret=self.secret_access_key, client_kwargs=kwargs) - s3map = s3fs.S3Map(root=root, s3=s3, check=False) - store = s3map - else: - store = str(self.source) # has to be a string in Python2.7 for local files - - try: - return zarr.open(store, mode=self.file_mode) - except ValueError: - raise ValueError("No Zarr store found at path '%s'" 
% self.source) - - @common_doc(COMMON_DATA_DOC) - def get_data(self, coordinates, coordinates_index): - """{get_data} - """ - data = self.create_output_array(coordinates) - if self.data_key is not None: - data[:] = self.dataset[self.data_key][coordinates_index] - else: - for key, name in zip(self.output_keys, self.outputs): - data.sel(output=name)[:] = self.dataset[key][coordinates_index] - return data - - @property - def available_keys(self): - """available data keys""" - dim_keys = [self.lat_key, self.lon_key, self.alt_key, self.time_key] - return [key for key in self.dataset if key not in dim_keys] - - -# TODO -@common_doc(COMMON_DATA_DOC) -class Rasterio(DataSource): - r"""Create a DataSource using Rasterio. - - Parameters - ---------- - source : str, :class:`io.BytesIO` - Path to the data source - band : int - The 'band' or index for the variable being accessed in files such as GeoTIFFs - - Attributes - ---------- - dataset : :class:`rasterio._io.RasterReader` - A reference to the datasource opened by rasterio - native_coordinates : :class:`podpac.Coordinates` - {native_coordinates} - crs : str, optional - The coordinate reference system. Normally this will come directly from the file, but this allows users to - specify the crs in case this information is missing from the file. - - Notes - ------ - The source could be a path to an s3 bucket file, e.g.: s3://landsat-pds/L8/139/045/LC81390452014295LGN00/LC81390452014295LGN00_B1.TIF - In that case, make sure to set the environmental variable: - * Windows: set CURL_CA_BUNDLE=\Library\ssl\cacert.pem - * Linux: export CURL_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt - """ - - source = tl.Union([tl.Unicode(), tl.Instance(BytesIO)]).tag(readonly=True) - dataset = tl.Any().tag(readonly=True) - crs = tl.Unicode(allow_none=True) - - @property - def nan_vals(self): - return list(self.dataset.nodatavals) - - # node attrs - band = tl.CInt(allow_none=True).tag(attr=True) - - @tl.default("band") - def _band_default(self): - if (self.outputs is not None) and (self.output is not None): - band = self.outputs.index(self.output) - elif self.outputs is None: - band = 1 - else: - band = None # All bands - return band - - @tl.default("dataset") - def _open_dataset(self): - """Opens the data source - - Returns - ------- - :class:`rasterio.io.DatasetReader` - Rasterio dataset - """ - - # TODO: dataset should not open by default - # prefer with as: syntax - - if isinstance(self.source, BytesIO): - # https://rasterio.readthedocs.io/en/latest/topics/memory-files.html - # TODO: this is still not working quite right - likely need to work - # out the BytesIO format or how we are going to read/write in memory - with rasterio.MemoryFile(self.source) as memfile: - return memfile.open(driver="GTiff") - - # local file - else: - return rasterio.open(self.source) - - def close_dataset(self): - """Closes the file for the datasource - """ - self.dataset.close() - - @common_doc(COMMON_DATA_DOC) - def get_native_coordinates(self): - """{get_native_coordinates} - - The default implementation tries to find the lat/lon coordinates based on dataset.affine. - It cannot determine the alt or time dimensions, so child classes may - have to overload this method. 
- """ - - # check to see if the coordinates are rotated used affine - affine = self.dataset.transform - - if isinstance(self.dataset.crs, rasterio.crs.CRS) and "init" in self.dataset.crs: - crs = self.dataset.crs["init"].upper() - elif isinstance(self.dataset.crs, dict) and "init" in self.dataset.crs: - crs = self.dataset.crs["init"].upper() - else: - try: - crs = pyproj.CRS(self.dataset.crs).to_wkt() - except pyproj.exceptions.CRSError: - if self.crs is None: - raise RuntimeError("Unexpected rasterio crs '%s'" % self.dataset.crs) - else: - crs = self.crs - - return Coordinates.from_geotransform(affine.to_gdal(), self.dataset.shape, crs) - - @common_doc(COMMON_DATA_DOC) - def get_data(self, coordinates, coordinates_index): - """{get_data} - """ - data = self.create_output_array(coordinates) - slc = coordinates_index - - # read data within coordinates_index window - window = ((slc[0].start, slc[0].stop), (slc[1].start, slc[1].stop)) - - if self.outputs is not None: # read all the bands - raster_data = self.dataset.read(out_shape=(len(self.outputs),) + tuple(coordinates.shape), window=window) - raster_data = np.moveaxis(raster_data, 0, 2) - else: # read the requested band - raster_data = self.dataset.read(self.band, out_shape=tuple(coordinates.shape), window=window) - - # set raster data to output array - data.data.ravel()[:] = raster_data.ravel() - return data - - @property - def band_count(self): - """The number of bands - - Returns - ------- - int - The number of bands in the dataset - """ - - if not hasattr(self, "_band_count"): - self._band_count = self.dataset.count - - return self._band_count - - @property - def band_descriptions(self): - """A description of each band contained in dataset.tags - - Returns - ------- - OrderedDict - Dictionary of band_number: band_description pairs. The band_description values are a dictionary, each - containing a number of keys -- depending on the metadata - """ - - if not hasattr(self, "_band_descriptions"): - self._band_descriptions = OrderedDict((i, self.dataset.tags(i + 1)) for i in range(self.band_count)) - - return self._band_descriptions - - @property - def band_keys(self): - """An alternative view of band_descriptions based on the keys present in the metadata - - Returns - ------- - dict - Dictionary of metadata keys, where the values are the value of the key for each band. - For example, band_keys['TIME'] = ['2015', '2016', '2017'] for a dataset with three bands. - """ - - if not hasattr(self, "_band_keys"): - keys = {k for i in range(self.band_count) for k in self.band_descriptions[i]} # set - self._band_keys = {k: [self.band_descriptions[i].get(k) for i in range(self.band_count)] for k in keys} - - return self._band_keys - - def get_band_numbers(self, key, value): - """Return the bands that have a key equal to a specified value. - - Parameters - ---------- - key : str / list - Key present in the metadata of the band. Can be a single key, or a list of keys. - value : str / list - Value of the key that should be returned. 
Can be a single value, or a list of values - - Returns - ------- - np.ndarray - An array of band numbers that match the criteria - """ - if (not hasattr(key, "__iter__") or isinstance(key, string_types)) and ( - not hasattr(value, "__iter__") or isinstance(value, string_types) - ): - key = [key] - value = [value] - - match = np.ones(self.band_count, bool) - for k, v in zip(key, value): - match = match & (np.array(self.band_keys[k]) == v) - matches = np.where(match)[0] + 1 - - return matches diff --git a/podpac/core/data/file_source.py b/podpac/core/data/file_source.py new file mode 100644 index 000000000..2749da19c --- /dev/null +++ b/podpac/core/data/file_source.py @@ -0,0 +1,249 @@ +""" +Datasources from files +""" + +from __future__ import division, unicode_literals, print_function, absolute_import + +import sys + +if sys.version_info.major == 2: + from urllib2 import urlopen +else: + from urllib.request import urlopen + +from io import BytesIO +import logging + +import traitlets as tl +import xarray as xr + +from lazy_import import lazy_module, lazy_class + +boto3 = lazy_module("boto3") +s3fs = lazy_module("s3fs") +requests = lazy_module("requests") + +from podpac.core.utils import common_doc, cached_property +from podpac.core.coordinates import Coordinates +from podpac.core.authentication import S3Mixin +from podpac.core.data.datasource import COMMON_DATA_DOC, DataSource + +# TODO common doc +_logger = logging.getLogger(__name__) + + +class BaseFileSource(DataSource): + """ + Base class for data sources loaded from file. + + Attributes + ---------- + source : str + Path to the data source. + coordinates : Coordinates + {coordinates} + dataset : Any + dataset object + """ + + source = tl.Unicode().tag(attr=True) + + # list of attribute names, used by __repr__ and __str__ to display minimal info about the node + _repr_keys = ["source", "interpolation"] + + @tl.default("source") + def _default_source(self): + raise ValueError("%s 'source' required" % self.__class__.__name__) + + # ------------------------------------------------------------------------- + # public api properties and methods + # ------------------------------------------------------------------------- + + @property + def dataset(self): + raise NotImplementedError() + + def close_dataset(self): + """ Close opened resources. Subclasses should implement if appropriate. """ + pass + + +class LoadFileMixin(S3Mixin): + """ + Mixin to load and cache files using various transport protocols. + + Attributes + ---------- + cache_dataset : bool + Whether to cache the dataset after loading. 
+ """ + + cache_dataset = tl.Bool(True) + + @cached_property + def _dataset_caching_node(self): + # stub node containing only the source node attr + return BaseFileSource(source=self.source, cache_ctrl=self.cache_ctrl) + + @cached_property + def dataset(self): + # use the _dataset_caching_node "stub" here because the only node attr we care about is the source + if self._dataset_caching_node.has_cache(key="dataset"): + data = self._dataset_caching_node.get_cache(key="dataset") + with BytesIO(data) as f: + return self._open(BytesIO(data), cache=False) + elif self.source.startswith("s3://"): + _logger.info("Loading AWS resource: %s" % self.source) + with self.s3.open(self.source, "rb") as f: + return self._open(f) + elif self.source.startswith("http://") or self.source.startswith("https://"): + _logger.info("Downloading: %s" % self.source) + response = requests.get(self.source) + with BytesIO(response.content) as f: + return self._open(f) + elif self.source.startswith("ftp://"): + _logger.info("Downloading: %s" % self.source) + addinfourl = urlopen(self.source) + with BytesIO(addinfourl.read()) as f: + return self._open(f) + elif self.source.startswith("file://"): + addinfourl = urlopen(self.source) + with BytesIO(addinfourl.read()) as f: + return self._open(f) + else: + with open(self.source, "rb") as f: + return self._open(f) + + def _open(self, f, cache=True): + if self.cache_dataset and cache: + self._dataset_caching_node.put_cache(f.read(), key="dataset") + f.seek(0) + return self.open_dataset(f) + + def open_dataset(self, f): + """ TODO """ + raise NotImplementedError() + + +@common_doc(COMMON_DATA_DOC) +class FileKeysMixin(tl.HasTraits): + """ + Mixin to specify data and coordinates dimensions keys. + + Attributes + ---------- + lat_key : str + latitude key, default 'lat' + lon_key : str + longitude key, default 'lon' + time_key : str + time key, default 'time' + alt_key : str + altitude key, default 'alt' + data_key : str, list + data key, or list of data keys for multiple-output nodes + crs : str + Coordinate reference system of the coordinates. 
+ cf_time : bool + decode CF datetimes + cf_units : str + units, when decoding CF datetimes + cf_calendar : str + calendar, when decoding CF datetimes + """ + + data_key = tl.Union([tl.Unicode(), tl.List(trait=tl.Unicode())]).tag(attr=True) + lat_key = tl.Unicode(default_value="lat").tag(attr=True) + lon_key = tl.Unicode(default_value="lon").tag(attr=True) + time_key = tl.Unicode(default_value="time").tag(attr=True) + alt_key = tl.Unicode(default_value="alt").tag(attr=True) + crs = tl.Unicode(allow_none=True, default_value=None).tag(attr=True) + cf_time = tl.Bool(default_value=False).tag(attr=True) + cf_units = tl.Unicode(allow_none=True, default_value=None).tag(attr=True) + cf_calendar = tl.Unicode(allow_none=True, default_value=None).tag(attr=True) + skip_validation = tl.Bool(False).tag(attr=True) + + @property + def _repr_keys(self): + """ list of attribute names, used by __repr__ and __str__ to display minimal info about the node""" + keys = ["source", "interpolation"] + if len(self.available_data_keys) > 1 and not isinstance(self.data_key, list): + keys.append("data_key") + return keys + + @tl.default("data_key") + def _default_data_key(self): + if len(self.available_data_keys) == 1: + return self.available_data_keys[0] + else: + return self.available_data_keys + + @tl.validate("data_key") + def _validate_data_key(self, d): + keys = d["value"] + if self.skip_validation: + return keys + if not isinstance(keys, list): + keys = [d["value"]] + for key in keys: + if key not in self.available_data_keys: + raise ValueError("Invalid data_key '%s', available keys are %s" % (key, self.available_data_keys)) + return d["value"] + + @tl.default("outputs") + def _default_outputs(self): + if not isinstance(self.data_key, list): + return None + else: + return self.data_key + + @tl.validate("outputs") + def _validate_outputs(self, d): + value = d["value"] + if self.skip_validation: + return value + if not isinstance(self.data_key, list): + if value is not None: + raise TypeError("outputs must be None for single-output nodes") + else: + if value is None: + raise TypeError("outputs and data_key mismatch (outputs=None, data_key=%s)" % self.data_key) + if len(value) != len(self.data_key): + raise ValueError("outputs and data_key size mismatch (%d != %d)" % (len(value), len(self.data_key))) + return value + + # ------------------------------------------------------------------------- + # public api properties and methods + # ------------------------------------------------------------------------- + + @property + def keys(self): + raise NotImplementedError + + @property + def dims(self): + raise NotImplementedError + + @cached_property + def available_data_keys(self): + """available data keys""" + dim_keys = [self.lat_key, self.lon_key, self.alt_key, self.time_key] + keys = [key for key in self.keys if key not in dim_keys] + if len(keys) == 0: + raise ValueError("No data keys found in '%s'" % self.source) + return keys + + def _lookup_key(self, dim): + lookup = {"lat": self.lat_key, "lon": self.lon_key, "alt": self.alt_key, "time": self.time_key} + return lookup[dim] + + @common_doc(COMMON_DATA_DOC) + def get_coordinates(self): + """{get_coordinates} + """ + + cs = [self.dataset[self._lookup_key(dim)] for dim in self.dims] + if self.cf_time and "time" in self.dims: + time_ind = self.dims.index("time") + cs[time_ind] = xr.coding.times.decode_cf_datetime(cs[time_ind], self.cf_units, self.cf_calendar) + return Coordinates(cs, dims=self.dims, crs=self.crs) diff --git a/podpac/core/data/h5py_source.py 
b/podpac/core/data/h5py_source.py
new file mode 100644
index 000000000..a8f0266c5
--- /dev/null
+++ b/podpac/core/data/h5py_source.py
@@ -0,0 +1,118 @@
+import traitlets as tl
+
+from lazy_import import lazy_module, lazy_class
+
+h5py = lazy_module("h5py")
+
+from podpac.core.utils import common_doc, cached_property
+from podpac.core.data.datasource import COMMON_DATA_DOC, DATA_DOC
+from podpac.core.data.file_source import BaseFileSource, FileKeysMixin
+
+
+@common_doc(COMMON_DATA_DOC)
+class H5PY(FileKeysMixin, BaseFileSource):
+    """Create a DataSource node using h5py.
+
+    Attributes
+    ----------
+    source : str
+        Path to the h5py file
+    dataset : h5py.File
+        The h5py file object used to read the file
+    coordinates : Coordinates
+        {coordinates}
+    file_mode : str, optional
+        Default is 'r'. The mode used to open the HDF5 file. Options are r, r+, w, w- or x, a (see h5py.File).
+    data_key : str, int
+        data key, default 'data'
+    lat_key : str, int
+        latitude coordinates key, default 'lat'
+    lon_key : str, int
+        longitude coordinates key, default 'lon'
+    time_key : str, int
+        time coordinates key, default 'time'
+    alt_key : str, int
+        altitude coordinates key, default 'alt'
+    array_dims : list of str
+        dataset dims, default ['lat', 'lon', 'alt', 'time'], one for each *_key that is defined
+    crs : str
+        Coordinate reference system of the coordinates
+    cf_time : bool
+        decode CF datetimes
+    cf_units : str
+        units, when decoding CF datetimes
+    cf_calendar : str
+        calendar, when decoding CF datetimes
+    """
+
+    file_mode = tl.Unicode(default_value="r").tag(readonly=True)
+    array_dims = tl.List(trait=tl.Unicode()).tag(readonly=True)
+
+    @cached_property
+    def dataset(self):
+        return h5py.File(self.source, self.file_mode)
+
+    def close_dataset(self):
+        """Closes the file."""
+        self.dataset.close()
+
+    # -------------------------------------------------------------------------
+    # public api methods
+    # -------------------------------------------------------------------------
+
+    @cached_property
+    def dims(self):
+        """ dataset coordinate dims """
+        try:
+            if not isinstance(self.data_key, list):
+                key = self.data_key
+            else:
+                key = self.data_key[0]
+            return self.dataset[key].attrs["_ARRAY_DIMENSIONS"]
+        except:
+            lookup = {self.lat_key: "lat", self.lon_key: "lon", self.alt_key: "alt", self.time_key: "time"}
+
+            # make sure array_dim key is in self.keys
+            if self.array_dims:
+                inv_lookup = {v: k for k, v in lookup.items()}
+                return [key for key in self.array_dims if inv_lookup[key] in self.keys]
+            else:
+                return [lookup[key] for key in self.keys if key in lookup]
+
+    @cached_property
+    def keys(self):
+        return H5PY._find_h5py_keys(self.dataset)
+
+    @common_doc(COMMON_DATA_DOC)
+    def get_data(self, coordinates, coordinates_index):
+        """{get_data}
+        """
+        data = self.create_output_array(coordinates)
+        if not isinstance(self.data_key, list):
+            data[:] = self.dataset[self.data_key][coordinates_index]
+        else:
+            for key, name in zip(self.data_key, self.outputs):
+                data.sel(output=name)[:] = self.dataset[key][coordinates_index]
+        return data
+
+    # -------------------------------------------------------------------------
+    # additional methods and properties
+    # -------------------------------------------------------------------------
+
+    def dataset_attrs(self, key="/"):
+        """Return the attributes of the dataset or group at the given key.
+ """ + return dict(self.dataset[key].attrs) + + @staticmethod + def _find_h5py_keys(obj, keys=[]): + # recursively find keys + + if isinstance(obj, (h5py.Group, h5py.File)): + for k in obj.keys(): + keys = H5PY._find_h5py_keys(obj[k], keys) + else: + keys.append(obj.name) + return keys + keys = sorted(list(set(keys))) + return keys diff --git a/podpac/core/data/ogc.py b/podpac/core/data/ogc.py index 6252b5b89..6f2114bad 100644 --- a/podpac/core/data/ogc.py +++ b/podpac/core/data/ogc.py @@ -12,7 +12,7 @@ import traitlets as tl from podpac.core.settings import settings -from podpac.core.utils import common_doc +from podpac.core.utils import common_doc, cached_property from podpac.core.data.datasource import COMMON_DATA_DOC, DataSource from podpac.core.coordinates import Coordinates, UniformCoordinates1d, ArrayCoordinates1d @@ -51,13 +51,13 @@ class WCS(DataSource): The coordinates of the WCS source """ - source = tl.Unicode().tag(readonly=True) - wcs_coordinates = tl.Instance(Coordinates).tag(readonly=True) # default below - - # node attrs + source = tl.Unicode().tag(attr=True) layer_name = tl.Unicode().tag(attr=True) - version = tl.Unicode(WCS_DEFAULT_VERSION).tag(attr=True) - crs = tl.Unicode(WCS_DEFAULT_CRS).tag(attr=True) + version = tl.Unicode(default_value=WCS_DEFAULT_VERSION).tag(attr=True) + crs = tl.Unicode(default_value=WCS_DEFAULT_CRS).tag(attr=True) + + # list of attribute names, used by __repr__ and __str__ to display minimal info about the node + _repr_keys = ["source", "interpolation"] _get_capabilities_qs = tl.Unicode("SERVICE=WCS&REQUEST=DescribeCoverage&" "VERSION={version}&COVERAGE={layer}") _get_data_qs = tl.Unicode( @@ -67,9 +67,8 @@ class WCS(DataSource): "WIDTH={width}&HEIGHT={height}&TIME={time}" ) - # TODO: This should be capabilities_url, not get_ @property - def get_capabilities_url(self): + def capabilities_url(self): """Constructs the url that requests the WCS capabilities Returns @@ -77,16 +76,16 @@ def get_capabilities_url(self): str The url that requests the WCS capabilities """ + return self.source + "?" + self._get_capabilities_qs.format(version=self.version, layer=self.layer_name) - @tl.default("wcs_coordinates") - def get_wcs_coordinates(self): - """Retrieves the native coordinates reported by the WCS service. + @cached_property + def wcs_coordinates(self): + """ Coordinates reported by the WCS service. Returns ------- Coordinates - The native coordinates reported by the WCS service. Notes ------- @@ -97,8 +96,9 @@ def get_wcs_coordinates(self): Exception Raises this if the required dependencies are not installed. 
""" + if requests is not None: - capabilities = requests.get(self.get_capabilities_url) + capabilities = requests.get(self.capabilities_url) if capabilities.status_code != 200: raise Exception("Could not get capabilities from WCS server") capabilities = capabilities.text @@ -110,10 +110,10 @@ def get_wcs_coordinates(self): else: http = urllib3.PoolManager() - r = http.request("GET", self.get_capabilities_url) + r = http.request("GET", self.capabilities_url) capabilities = r.data if r.status != 200: - raise Exception("Could not get capabilities from WCS server:" + self.get_capabilities_url) + raise Exception("Could not get capabilities from WCS server:" + self.capabilities_url) else: raise Exception("Do not have a URL request library to get WCS data.") @@ -166,15 +166,9 @@ def get_wcs_coordinates(self): ] ) - @property @common_doc(COMMON_DATA_DOC) - def native_coordinates(self): - """{native_coordinates} - - Returns - ------- - Coordinates - {native_coordinates} + def get_coordinates(self): + """{get_coordinates} Notes ------ @@ -269,7 +263,7 @@ def get_data(self, coordinates, coordinates_index): output.data[i, ...] = dataset.read() except Exception as e: # Probably python 2 print(e) - tmppath = os.path.join(settings["DISK_CACHE_DIR"], "wcs_temp.tiff") + tmppath = os.path.join(settings.cache_path, "wcs_temp.tiff") if not os.path.exists(os.path.split(tmppath)[0]): os.makedirs(os.path.split(tmppath)[0]) @@ -339,7 +333,7 @@ def get_data(self, coordinates, coordinates_index): output.data[:] = dataset.read() except Exception as e: # Probably python 2 print(e) - tmppath = os.path.join(settings["DISK_CACHE_DIR"], "wcs_temp.tiff") + tmppath = os.path.join(settings.cache_path, "wcs_temp.tiff") if not os.path.exists(os.path.split(tmppath)[0]): os.makedirs(os.path.split(tmppath)[0]) open(tmppath, "wb").write(content) @@ -365,11 +359,8 @@ def get_data(self, coordinates, coordinates_index): @property def base_ref(self): - """Summary - - Returns - ------- - TYPE - Description - """ + """ definition base_ref """ + if not self.layer_name: + return super(WCS, self).base_ref + return self.layer_name.rsplit(".", 1)[1] diff --git a/podpac/core/data/pydap_source.py b/podpac/core/data/pydap_source.py index e53220c18..80cb66e8b 100644 --- a/podpac/core/data/pydap_source.py +++ b/podpac/core/data/pydap_source.py @@ -4,15 +4,18 @@ from __future__ import division, unicode_literals, print_function, absolute_import +import logging + import numpy as np import traitlets as tl +import requests # Helper utility for optional imports from lazy_import import lazy_module, lazy_class # Internal dependencies from podpac.core import authentication -from podpac.core.utils import common_doc +from podpac.core.utils import common_doc, cached_property from podpac.core.data.datasource import COMMON_DATA_DOC, DataSource # Optional dependencies @@ -21,137 +24,88 @@ lazy_module("pydap.model") +_logger = logging.getLogger(__name__) + + @common_doc(COMMON_DATA_DOC) -class PyDAP(DataSource): +class PyDAP(authentication.RequestsSessionMixin, DataSource): """Create a DataSource from an OpenDAP server feed. Attributes ---------- - auth_class : :class:`podpac.authentication.Session` - :class:`requests.Session` derived class providing authentication credentials. - When username and password are provided, an auth_session is created using this class. - auth_session : :class:`podpac.authentication.Session` - Instance of the auth_class. 
This is created if username and password is supplied, but this object can also be - supplied directly - datakey : str + data_key : str Pydap 'key' for the data to be retrieved from the server. Datasource may have multiple keys, so this key determines which variable is returned from the source. dataset : pydap.model.DatasetType The open pydap dataset. This is provided for troubleshooting. - native_coordinates : Coordinates - {native_coordinates} - password : str, optional - Password used for authenticating against OpenDAP server. WARNING: this is stored as plain-text, provide - auth_session instead if you have security concerns. + coordinates : Coordinates + {coordinates} source : str URL of the OpenDAP server. - username : str, optional - Username used for authenticating against OpenDAP server. WARNING: this is stored as plain-text, provide - auth_session instead if you have security concerns. """ - source = tl.Unicode().tag(readonly=True) - dataset = tl.Instance("pydap.model.DatasetType").tag(readonly=True) - - # node attrs - datakey = tl.Unicode().tag(attr=True) - - # optional inputs - auth_class = tl.Type(authentication.Session) - auth_session = tl.Instance(authentication.Session, allow_none=True) - username = tl.Unicode(default_value=None, allow_none=True) - password = tl.Unicode(default_value=None, allow_none=True) - - @tl.default("auth_session") - def _auth_session_default(self): + source = tl.Unicode().tag(attr=True) + data_key = tl.Unicode().tag(attr=True) - # requires username and password - if not self.username or not self.password: - return None + # list of attribute names, used by __repr__ and __str__ to display minimal info about the node + _repr_keys = ["source", "interpolation"] - # requires auth_class - # TODO: default auth_class? - if not self.auth_class: - return None - - # instantiate and check utl + # hostname for RequestsSession is source. Try parsing off netloc + @tl.default("hostname") + def _hostname(self): try: - session = self.auth_class(username=self.username, password=self.password) - session.get(self.source + ".dds") + return requests.utils.urlparse(self.source).netloc except: - # TODO: catch a 403 error - return None - - return session + return self.source - @tl.default("dataset") - def _open_dataset(self): - """Summary - - Parameters - ---------- - source : str, optional - Description + @common_doc(COMMON_DATA_DOC) + def get_coordinates(self): + """{get_coordinates} - Returns - ------- - TYPE - Description + Raises + ------ + NotImplementedError + PyDAP cannot create coordinates. A child class must implement this method. """ + raise NotImplementedError("PyDAP cannot create coordinates. A child class must implement this method.") + @cached_property + def dataset(self): # auth session - # if self.auth_session: try: - dataset = self._open_url() + return self._open_url() except Exception: # TODO handle a 403 error # TODO: Check Url (probably inefficient...) try: - self.auth_session.get(self.source + ".dds") - dataset = self._open_url() + self.session.get(self.source + ".dds") + return self._open_url() except Exception: # TODO: handle 403 error - print("Warning, dataset could not be opened. Check login credentials.") - dataset = None - - return dataset + _logger.exception("Error opening PyDap url '%s'" % self.source) + raise RuntimeError("Could not open PyDap url '%s'.\nCheck login credentials." 
% self.source)
 
     def _open_url(self):
-        return pydap.client.open_url(self.source, session=self.auth_session)
-
-    @common_doc(COMMON_DATA_DOC)
-    def get_native_coordinates(self):
-        """{get_native_coordinates}
-
-        Raises
-        ------
-        NotImplementedError
-            DAP has no mechanism for creating coordinates automatically, so this is left up to child classes.
-        """
-        raise NotImplementedError(
-            "DAP has no mechanism for creating coordinates"
-            + ", so this is left up to child class "
-            + "implementations."
-        )
+        return pydap.client.open_url(self.source, session=self.session)
 
     @common_doc(COMMON_DATA_DOC)
     def get_data(self, coordinates, coordinates_index):
         """{get_data}
         """
-        data = self.dataset[self.datakey][tuple(coordinates_index)]
+        data = self.dataset[self.data_key][tuple(coordinates_index)]
         # PyDAP 3.2.1 gives a numpy array for the above, whereas 3.2.2 needs the .data attribute to get a numpy array
         if not isinstance(data, np.ndarray) and hasattr(data, "data"):
             data = data.data
         d = self.create_output_array(coordinates, data=data.reshape(coordinates.shape))
         return d
 
-    @property
+    @cached_property
     def keys(self):
         """The list of available keys from the OpenDAP dataset.
 
         Returns
         -------
         List
-            The list of available keys from the OpenDAP dataset. Any of these keys can be set as self.datakey
+            The list of available keys from the OpenDAP dataset. Any of these keys can be set as self.data_key
         """
         return self.dataset.keys()
diff --git a/podpac/core/data/rasterio_source.py b/podpac/core/data/rasterio_source.py
new file mode 100644
index 000000000..c3494a476
--- /dev/null
+++ b/podpac/core/data/rasterio_source.py
@@ -0,0 +1,181 @@
+from __future__ import division, unicode_literals, print_function, absolute_import
+
+from collections import OrderedDict
+import io
+
+from six import string_types
+import traitlets as tl
+import numpy as np
+import pyproj
+
+from lazy_import import lazy_module
+
+rasterio = lazy_module("rasterio")
+
+from podpac.core.utils import common_doc, cached_property
+from podpac.core.coordinates import UniformCoordinates1d, Coordinates
+from podpac.core.data.datasource import COMMON_DATA_DOC, DATA_DOC
+from podpac.core.data.file_source import BaseFileSource, LoadFileMixin
+
+
+@common_doc(COMMON_DATA_DOC)
+class Rasterio(LoadFileMixin, BaseFileSource):
+    """Create a DataSource using rasterio.
+
+    Attributes
+    ----------
+    source : str, :class:`io.BytesIO`
+        Path to the data source
+    dataset : :class:`rasterio._io.RasterReader`
+        A reference to the datasource opened by rasterio
+    coordinates : :class:`podpac.Coordinates`
+        {coordinates}
+    band : int
+        The 'band' or index for the variable being accessed in files such as GeoTIFFs. Use None for all bands.
+    crs : str, optional
+        The coordinate reference system. Normally this will come directly from the file, but this allows users to
+        specify the crs in case this information is missing from the file.
+    """
+
+    # dataset = tl.Instance(rasterio.DatasetReader).tag(readonly=True)
+    band = tl.CInt(allow_none=True).tag(attr=True)
+    crs = tl.Unicode(allow_none=True, default_value=None).tag(attr=True)
+    driver = tl.Unicode(allow_none=True, default_value=None)
+
+    @tl.default("band")
+    def _band_default(self):
+        if self.outputs is not None and self.output is not None:
+            return self.outputs.index(self.output)
+        elif self.outputs is None:
+            return 1
+        else:
+            return None  # All bands
+
+    # -------------------------------------------------------------------------
+    # public api methods
+    # -------------------------------------------------------------------------
+
+    @cached_property
+    def nan_vals(self):
+        return list(self.dataset.nodatavals)
+
+    def open_dataset(self, fp, **kwargs):
+        with rasterio.MemoryFile() as mf:
+            mf.write(fp.read())
+            return mf.open(driver=self.driver)
+
+    def close_dataset(self):
+        """Closes the file for the datasource
+        """
+        self.dataset.close()
+
+    @common_doc(COMMON_DATA_DOC)
+    def get_coordinates(self):
+        """{get_coordinates}
+
+        The default implementation tries to find the lat/lon coordinates based on dataset.affine.
+        It cannot determine the alt or time dimensions, so child classes may
+        have to overload this method.
+        """
+
+        # check to see if the coordinates are rotated using affine
+        affine = self.dataset.transform
+
+        if self.crs is not None:
+            crs = self.crs
+        elif isinstance(self.dataset.crs, rasterio.crs.CRS) and "init" in self.dataset.crs:
+            crs = self.dataset.crs["init"].upper()
+        elif isinstance(self.dataset.crs, dict) and "init" in self.dataset.crs:
+            crs = self.dataset.crs["init"].upper()
+        else:
+            try:
+                crs = pyproj.CRS(self.dataset.crs).to_wkt()
+            except pyproj.exceptions.CRSError:
+                raise RuntimeError("Unexpected rasterio crs '%s'" % self.dataset.crs)
+
+        return Coordinates.from_geotransform(affine.to_gdal(), self.dataset.shape, crs)
+
+    @common_doc(COMMON_DATA_DOC)
+    def get_data(self, coordinates, coordinates_index):
+        """{get_data}
+        """
+        data = self.create_output_array(coordinates)
+        slc = coordinates_index
+
+        # read data within coordinates_index window
+        window = ((slc[0].start, slc[0].stop), (slc[1].start, slc[1].stop))
+
+        if self.outputs is not None:  # read all the bands
+            raster_data = self.dataset.read(out_shape=(len(self.outputs),) + tuple(coordinates.shape), window=window)
+            raster_data = np.moveaxis(raster_data, 0, 2)
+        else:  # read the requested band
+            raster_data = self.dataset.read(self.band, out_shape=tuple(coordinates.shape), window=window)
+
+        # set raster data to output array
+        data.data.ravel()[:] = raster_data.ravel()
+        return data
+
+    # -------------------------------------------------------------------------
+    # additional methods and properties
+    # -------------------------------------------------------------------------
+
+    @property
+    def band_count(self):
+        """The number of bands"""
+
+        return self.dataset.count
+
+    @cached_property
+    def band_descriptions(self):
+        """ A description of each band contained in dataset.tags
+
+        Returns
+        -------
+        OrderedDict
+            Dictionary of band_number: band_description pairs.
The band_description values are a dictionary, each + containing a number of keys -- depending on the metadata + """ + + return OrderedDict((i, self.dataset.tags(i + 1)) for i in range(self.band_count)) + + @cached_property + def band_keys(self): + """An alternative view of band_descriptions based on the keys present in the metadata + + Returns + ------- + dict + Dictionary of metadata keys, where the values are the value of the key for each band. + For example, band_keys['TIME'] = ['2015', '2016', '2017'] for a dataset with three bands. + """ + + keys = {k for i in range(self.band_count) for k in self.band_descriptions[i]} # set + return {k: [self.band_descriptions[i].get(k) for i in range(self.band_count)] for k in keys} + + def get_band_numbers(self, key, value): + """Return the bands that have a key equal to a specified value. + + Parameters + ---------- + key : str / list + Key present in the metadata of the band. Can be a single key, or a list of keys. + value : str / list + Value of the key that should be returned. Can be a single value, or a list of values + + Returns + ------- + np.ndarray + An array of band numbers that match the criteria + """ + if not hasattr(key, "__iter__") or isinstance(key, string_types): + key = [key] + + if not hasattr(value, "__iter__") or isinstance(value, string_types): + value = [value] + + match = np.ones(self.band_count, bool) + for k, v in zip(key, value): + match = match & (np.array(self.band_keys[k]) == v) + matches = np.where(match)[0] + 1 + + return matches diff --git a/podpac/core/data/reprojection.py b/podpac/core/data/reprojection.py index df1bd3252..238877685 100644 --- a/podpac/core/data/reprojection.py +++ b/podpac/core/data/reprojection.py @@ -1,14 +1,18 @@ from __future__ import division, unicode_literals, print_function, absolute_import -from six import string_types +import logging +import copy +from six import string_types import traitlets as tl -from podpac.core.utils import common_doc, NodeTrait +from podpac.core.utils import common_doc, NodeTrait, cached_property from podpac.core.coordinates import Coordinates from podpac.core.node import Node +from podpac.core.interpolation.interpolation import InterpolationTrait from podpac.core.data.datasource import COMMON_DATA_DOC, DataSource -from podpac.core.data.interpolation import interpolation_trait + +_logger = logging.getLogger(__name__) class ReprojectedSource(DataSource): @@ -25,12 +29,13 @@ class ReprojectedSource(DataSource): Coordinates where the source node should be evaluated. """ - source = NodeTrait().tag(readonly=True) - - # node attrs - source_interpolation = interpolation_trait().tag(attr=True) + source = NodeTrait().tag(attr=True) + source_interpolation = InterpolationTrait().tag(attr=True) reprojected_coordinates = tl.Instance(Coordinates).tag(attr=True) + # list of attribute names, used by __repr__ and __str__ to display minimal info about the node + _repr_keys = ["source", "interpolation"] + def _first_init(self, **kwargs): if "reprojected_coordinates" in kwargs: if isinstance(kwargs["reprojected_coordinates"], dict): @@ -40,34 +45,49 @@ def _first_init(self, **kwargs): return super(ReprojectedSource, self)._first_init(**kwargs) + @cached_property + def eval_source(self): + if self.source_interpolation is not None and not self.source.has_trait("interpolation"): + _logger.warning( + "ReprojectedSource cannot set the 'source_interpolation'" + " since 'source' does not have an 'interpolation' " + " trait. 
\n type(source): %s\nsource: %s" % (str(type(self.source)), str(self.source)) + ) + + source = self.source + if ( + self.source_interpolation is not None + and self.source.has_trait("interpolation") + and self.source_interpolation != self.source.interpolation + ): + source = copy.deepcopy(source) + source.set_trait("interpolation", self.source_interpolation) + + return source + @common_doc(COMMON_DATA_DOC) - def get_native_coordinates(self): - """{get_native_coordinates} + def get_coordinates(self): + """{get_coordinates} """ - if isinstance(self.source, DataSource): - sc = self.source.native_coordinates - else: # Otherwise we cannot guarantee that native_coordinates exist - sc = self.reprojected_coordinates + + # cannot guarantee that coordinates exist + if not isinstance(self.source, DataSource): + return self.reprojected_coordinates + + sc = self.source.coordinates rc = self.reprojected_coordinates - coords = [rc[dim] if dim in rc.dims else sc[dim] for dim in sc.dims] - return Coordinates(coords) + return Coordinates( + [rc[dim] if dim in rc.dims else self.source.coordinates[dim] for dim in self.source.coordinates.dims], + validate_crs=False, + ) @common_doc(COMMON_DATA_DOC) def get_data(self, coordinates, coordinates_index): """{get_data} """ - if hasattr(self.source, "interpolation") and self.source_interpolation is not None: - si = self.source.interpolation - self.source.interpolation = self.source_interpolation - elif self.source_interpolation is not None: - _logger.warning( - "ReprojectedSource cannot set the 'source_interpolation'" - " since self.source does not have an 'interpolation' " - " attribute. \n type(self.source): %s\nself.source: " % (str(type(self.source)), str(self.source)) - ) - data = self.source.eval(coordinates) - if hasattr(self.source, "interpolation") and self.source_interpolation is not None: - self.source.interpolation = si + + data = self.eval_source.eval(coordinates) + # The following is needed in case the source is an algorithm # or compositor node that doesn't have all the dimensions of # the reprojected coordinates @@ -79,11 +99,4 @@ def get_data(self, coordinates, coordinates_index): @property def base_ref(self): - """Summary - - Returns - ------- - TYPE - Description - """ return "{}_reprojected".format(self.source.base_ref) diff --git a/podpac/core/data/test/assets/points-no-data.csv b/podpac/core/data/test/assets/points-no-data.csv new file mode 100644 index 000000000..b15cf818a --- /dev/null +++ b/podpac/core/data/test/assets/points-no-data.csv @@ -0,0 +1,6 @@ +lat,lon,time,altitude +0,0,2018-01-01T12:00:00,0 +1,0,2018-01-01T12:00:00,0 +1,2,2018-01-01T12:00:00,0 +1,2,2018-01-01T12:00:03,0 +1,2,2018-01-01T12:00:03,4 \ No newline at end of file diff --git a/podpac/core/data/test/assets/points-one-dim.csv b/podpac/core/data/test/assets/points-one-dim.csv new file mode 100644 index 000000000..eb463c791 --- /dev/null +++ b/podpac/core/data/test/assets/points-one-dim.csv @@ -0,0 +1,6 @@ +time,data +2018-01-01T12:00:00,0 +2018-01-02T12:00:00,1 +2018-01-03T12:00:00,2 +2018-01-04T12:00:03,3 +2018-01-05T12:00:03,4 \ No newline at end of file diff --git a/podpac/core/data/test/test_array.py b/podpac/core/data/test/test_array.py index 87eeb6184..52a320578 100644 --- a/podpac/core/data/test/test_array.py +++ b/podpac/core/data/test/test_array.py @@ -1,4 +1,5 @@ import numpy as np +import traitlets as tl import pytest from podpac.core.coordinates import Coordinates, clinspace @@ -13,33 +14,30 @@ class TestArray(object): data = np.random.rand(11, 11) coordinates = 
Coordinates([clinspace(-25, 25, 11), clinspace(-25, 25, 11)], dims=["lat", "lon"]) - def test_source_trait(self): - """ must be an ndarray """ - - node = Array(source=self.data, native_coordinates=self.coordinates) + def test_data_array(self): + node = Array(source=self.data, coordinates=self.coordinates) + def test_data_list(self): # list is coercable to array - node = Array(source=[0, 1, 1], native_coordinates=self.coordinates) + node = Array(source=[0, 1, 1], coordinates=self.coordinates) - # this list is not coercable to array - # Starting with numpy 0.16, this is now allowed! - # with pytest.raises(TraitError): - # node = Array(source=[0, [0, 1]], native_coordinates=self.coordinates) + def test_invalid_data(self): + with pytest.raises(ValueError, match="Array 'source' data must be numerical"): + node = Array(source=["a", "b"], coordinates=self.coordinates) def test_get_data(self): """ defined get_data function""" - source = self.data - node = Array(source=source, native_coordinates=self.coordinates) + node = Array(source=self.data, coordinates=self.coordinates) output = node.eval(self.coordinates) assert isinstance(output, UnitsDataArray) - assert output.values[0, 0] == source[0, 0] - assert output.values[4, 5] == source[4, 5] + assert output.values[0, 0] == self.data[0, 0] + assert output.values[4, 5] == self.data[4, 5] def test_get_data_multiple(self): data = np.random.rand(11, 11, 2) - node = Array(source=data, native_coordinates=self.coordinates, outputs=["a", "b"]) + node = Array(source=data, coordinates=self.coordinates, outputs=["a", "b"]) output = node.eval(self.coordinates) assert isinstance(output, UnitsDataArray) assert output.dims == ("lat", "lon", "output") @@ -47,43 +45,20 @@ def test_get_data_multiple(self): np.testing.assert_array_equal(output.sel(output="a"), data[:, :, 0]) np.testing.assert_array_equal(output.sel(output="b"), data[:, :, 1]) - node = Array(source=data, native_coordinates=self.coordinates, outputs=["a", "b"], output="b") + node = Array(source=data, coordinates=self.coordinates, outputs=["a", "b"], output="b") output = node.eval(self.coordinates) assert isinstance(output, UnitsDataArray) assert output.dims == ("lat", "lon") np.testing.assert_array_equal(output, data[:, :, 1]) - def test_native_coordinates(self): - """test that native coordinates get defined""" + def test_coordinates(self): + node = Array(source=self.data, coordinates=self.coordinates) + assert node.coordinates node = Array(source=self.data) - with pytest.raises(NotImplementedError): - node.get_native_coordinates() - - node = Array(source=self.data, native_coordinates=self.coordinates) - assert node.native_coordinates - - node = Array(source=self.data, native_coordinates=self.coordinates) - native_coordinates = node.native_coordinates - get_native_coordinates = node.get_native_coordinates() - assert native_coordinates - assert get_native_coordinates - assert native_coordinates == get_native_coordinates - - def test_base_definition(self): - node = Array(source=self.data, native_coordinates=self.coordinates) - d = node.base_definition - source = np.array(d["source"]) - np.testing.assert_array_equal(source, self.data) - - def test_definition(self): - node = Array(source=self.data, native_coordinates=self.coordinates) - node2 = Node.from_definition(node.definition) - assert isinstance(node2, Array) - np.testing.assert_array_equal(node2.source, self.data) + with pytest.raises(tl.TraitError): + node.coordinates - def test_json(self): - node = Array(source=self.data, 
native_coordinates=self.coordinates) - node2 = Node.from_json(node.json) - assert isinstance(node2, Array) - np.testing.assert_array_equal(node2.source, self.data) + def test_no_cache(self): + node = Array() + assert len(node.cache_ctrl._cache_stores) == 0 diff --git a/podpac/core/data/test/test_base_dataset_source.py b/podpac/core/data/test/test_base_dataset_source.py deleted file mode 100644 index a81ae6ece..000000000 --- a/podpac/core/data/test/test_base_dataset_source.py +++ /dev/null @@ -1,209 +0,0 @@ -import numpy as np -import pytest - -from podpac.core.data.file import DatasetSource - -LAT = [0, 1, 2] -LON = [10, 20] -TIME = [100, 200] -ALT = [1, 2, 3, 4] -DATA = np.arange(48).reshape((3, 2, 2, 4)) -OTHER = 2 * np.arange(48).reshape((3, 2, 2, 4)) - - -class MockDatasetSourceSingle(DatasetSource): - source = "mock-single" - dataset = {"lat": LAT, "lon": LON, "time": TIME, "alt": ALT, "data": DATA} - dims = ["lat", "lon", "time", "alt"] - available_keys = ["data"] - - -class MockDatasetSourceMultiple(DatasetSource): - source = "mock-multiple" - dataset = {"lat": LAT, "lon": LON, "time": TIME, "alt": ALT, "data": DATA, "other": OTHER} - dims = ["lat", "lon", "time", "alt"] - available_keys = ["data", "other"] - - -class TestDatasetSource(object): - def test_not_implemented(self): - with pytest.raises(NotImplementedError): - node = DatasetSource() - - def test_init(self): - node = MockDatasetSourceSingle() - node = MockDatasetSourceMultiple() - - def test_close(self): - node = MockDatasetSourceSingle() - node.close_dataset() - - def test_data_key_and_output_keys(self): - # cannot both be defined - with pytest.raises(TypeError, match=".* cannot have both"): - node = MockDatasetSourceSingle(data_key="data", output_keys=["data"]) - - # make a standard single-output node for datasets with a single non-dimension key - node = MockDatasetSourceSingle() - assert node.data_key == "data" - assert node.output_keys is None - assert node.outputs is None - - # make a multi-output node for datasets with multiple non-dimension keys - node = MockDatasetSourceMultiple() - assert node.data_key is None - assert node.output_keys == ["data", "other"] - assert node.outputs == ["data", "other"] - - def test_outputs(self): - # standard single-output nodes have no "outputs" - node = MockDatasetSourceSingle(data_key="data") - assert node.outputs == None - - node = MockDatasetSourceMultiple(data_key="data") - assert node.outputs == None - - # for multi-output nodes, use the dataset's keys (output_keys) by default - node = MockDatasetSourceSingle(output_keys=["data"]) - assert node.outputs == ["data"] - - node = MockDatasetSourceMultiple(output_keys=["data", "other"]) - assert node.outputs == ["data", "other"] - - node = MockDatasetSourceMultiple(output_keys=["data"]) - assert node.outputs == ["data"] - - # alternate outputs names can be specified - node = MockDatasetSourceSingle(output_keys=["data"], outputs=["a"]) - assert node.outputs == ["a"] - - node = MockDatasetSourceMultiple(output_keys=["data", "other"], outputs=["a", "b"]) - assert node.outputs == ["a", "b"] - - node = MockDatasetSourceMultiple(output_keys=["data"], outputs=["a"]) - assert node.outputs == ["a"] - - node = MockDatasetSourceMultiple(outputs=["a", "b"]) - assert node.outputs == ["a", "b"] - - # but the outputs and output_keys must match - with pytest.raises(ValueError, match="outputs and output_keys size mismatch"): - node = MockDatasetSourceMultiple(output_keys=["data"], outputs=["a", "b"]) - - with pytest.raises(ValueError, match="outputs 
and output_keys size mismatch"): - node = MockDatasetSourceMultiple(output_keys=["data", "other"], outputs=["a"]) - - with pytest.raises(ValueError, match="outputs and output_keys size mismatch"): - node = MockDatasetSourceMultiple(outputs=["a"]) - - # and outputs cannot be provided for single-output nodes - with pytest.raises(TypeError, match="outputs must be None for single-output nodes"): - node = MockDatasetSourceSingle(data_key="data", outputs=["a"]) - - with pytest.raises(TypeError, match="outputs must be None for single-output nodes"): - node = MockDatasetSourceMultiple(data_key="data", outputs=["a"]) - - with pytest.raises(TypeError, match="outputs must be None for single-output nodes"): - node = MockDatasetSourceSingle(outputs=["a"]) - - def test_output(self): - with pytest.raises(TypeError, match="Invalid output"): - node = MockDatasetSourceSingle(data_key="data", output="data") - - with pytest.raises(ValueError, match="Invalid output"): - node = MockDatasetSourceMultiple(outputs=["a", "b"], output="other") - - def test_native_coordinates(self): - node = MockDatasetSourceSingle() - nc = node.native_coordinates - assert nc.dims == ("lat", "lon", "time", "alt") - np.testing.assert_array_equal(nc["lat"].coordinates, LAT) - np.testing.assert_array_equal(nc["lon"].coordinates, LON) - np.testing.assert_array_equal(nc["time"].coordinates, TIME) - np.testing.assert_array_equal(nc["alt"].coordinates, ALT) - - def test_base_definition(self): - node = MockDatasetSourceSingle() - d = node.base_definition - assert "attrs" in d - assert "lat_key" in d["attrs"] - assert "lon_key" in d["attrs"] - assert "alt_key" in d["attrs"] - assert "time_key" in d["attrs"] - assert "data_key" in d["attrs"] - assert "output_keys" not in d["attrs"] - assert "outputs" not in d["attrs"] - assert "cf_time" not in d["attrs"] - assert "cf_units" not in d["attrs"] - assert "cf_calendar" not in d["attrs"] - assert "crs" not in d["attrs"] - - node = MockDatasetSourceMultiple() - d = node.base_definition - assert "attrs" in d - assert "lat_key" in d["attrs"] - assert "lon_key" in d["attrs"] - assert "alt_key" in d["attrs"] - assert "time_key" in d["attrs"] - assert "output_keys" in d["attrs"] - assert "data_key" not in d["attrs"] - assert "outputs" not in d["attrs"] - assert "cf_time" not in d["attrs"] - assert "cf_units" not in d["attrs"] - assert "cf_calendar" not in d["attrs"] - assert "crs" not in d["attrs"] - - node = MockDatasetSourceMultiple(outputs=["a", "b"]) - d = node.base_definition - assert "attrs" in d - assert "lat_key" in d["attrs"] - assert "lon_key" in d["attrs"] - assert "alt_key" in d["attrs"] - assert "time_key" in d["attrs"] - assert "output_keys" in d["attrs"] - assert "outputs" in d["attrs"] - assert "data_key" not in d["attrs"] - assert "cf_time" not in d["attrs"] - assert "cf_units" not in d["attrs"] - assert "cf_calendar" not in d["attrs"] - assert "crs" not in d["attrs"] - - class MockDatasetSource1(DatasetSource): - source = "temp" - dims = ["lat", "lon"] - available_keys = ["data"] - - node = MockDatasetSource1(crs="EPSG::3294") - d = node.base_definition - assert "attrs" in d - assert "lat_key" in d["attrs"] - assert "lon_key" in d["attrs"] - assert "data_key" in d["attrs"] - assert "crs" in d["attrs"] - assert "time_key" not in d["attrs"] - assert "alt_key" not in d["attrs"] - assert "output_keys" not in d["attrs"] - assert "outputs" not in d["attrs"] - assert "cf_time" not in d["attrs"] - assert "cf_units" not in d["attrs"] - assert "cf_calendar" not in d["attrs"] - - class 
MockDatasetSource2(DatasetSource): - source = "temp" - dims = ["time", "alt"] - available_keys = ["data"] - - node = MockDatasetSource2(cf_time=True) - d = node.base_definition - assert "attrs" in d - assert "alt_key" in d["attrs"] - assert "time_key" in d["attrs"] - assert "data_key" in d["attrs"] - assert "cf_time" in d["attrs"] - assert "cf_units" in d["attrs"] - assert "cf_calendar" in d["attrs"] - assert "crs" not in d["attrs"] - assert "lat_key" not in d["attrs"] - assert "lon_key" not in d["attrs"] - assert "output_keys" not in d["attrs"] - assert "outputs" not in d["attrs"] diff --git a/podpac/core/data/test/test_csv.py b/podpac/core/data/test/test_csv.py index b8d6026bc..ba6b56bb3 100644 --- a/podpac/core/data/test/test_csv.py +++ b/podpac/core/data/test/test_csv.py @@ -3,7 +3,7 @@ import pytest import numpy as np -from podpac.core.data.file import CSV +from podpac.core.data.csv_source import CSV class TestCSV(object): @@ -13,6 +13,8 @@ class TestCSV(object): source_single = os.path.join(os.path.dirname(__file__), "assets/points-single.csv") source_multiple = os.path.join(os.path.dirname(__file__), "assets/points-multiple.csv") source_no_header = os.path.join(os.path.dirname(__file__), "assets/points-no-header.csv") + source_one_dim = os.path.join(os.path.dirname(__file__), "assets/points-one-dim.csv") + source_no_data = os.path.join(os.path.dirname(__file__), "assets/points-no-data.csv") lat = [0, 1, 1, 1, 1] lon = [0, 0, 2, 2, 2] @@ -30,68 +32,146 @@ class TestCSV(object): data = [0, 1, 2, 3, 4] other = [10.5, 20.5, 30.5, 40.5, 50.5] - # def test_init(self): - # node = CSV(source=self.source_single, alt_key="altitude", crs="+proj=merc +vunits=m") + def test_init(self): + node = CSV(source=self.source_single, alt_key="altitude", crs="+proj=merc +vunits=m") + + def test_close(self): + node = CSV(source=self.source_single, alt_key="altitude", crs="+proj=merc +vunits=m") + + def test_get_dims(self): + node = CSV(source=self.source_single, alt_key="altitude", crs="+proj=merc +vunits=m") + assert node.dims == ["lat", "lon", "time", "alt"] + + node = CSV(source=self.source_multiple, alt_key="altitude", crs="+proj=merc +vunits=m") + assert node.dims == ["lat", "lon", "time", "alt"] - # def test_close(self): - # node = CSV(source=self.source_single, alt_key="altitude", crs="+proj=merc +vunits=m") - # node.close_dataset() + def test_available_data_keys(self): + node = CSV(source=self.source_single, alt_key="altitude", crs="+proj=merc +vunits=m") + assert node.available_data_keys == ["data"] + + node = CSV(source=self.source_multiple, alt_key="altitude", crs="+proj=merc +vunits=m") + assert node.available_data_keys == ["data", "other"] + + node = CSV(source=self.source_no_data, alt_key="altitude", crs="+proj=merc +vunits=m") + with pytest.raises(ValueError, match="No data keys found"): + node.available_data_keys + + def test_data_key(self): + # default + node = CSV(source=self.source_single, alt_key="altitude", crs="+proj=merc +vunits=m") + assert node.data_key == "data" + + # specify + node = CSV(source=self.source_single, data_key="data", alt_key="altitude", crs="+proj=merc +vunits=m") + assert node.data_key == "data" + + # invalid + with pytest.raises(ValueError, match="Invalid data_key"): + node = CSV(source=self.source_single, data_key="misc", alt_key="altitude", crs="+proj=merc +vunits=m") + + def test_data_key_col(self): + # specify column + node = CSV(source=self.source_single, data_key=4, alt_key="altitude", crs="+proj=merc +vunits=m") + assert node.data_key == 4 + + # invalid 
(out of range) + with pytest.raises(ValueError, match="Invalid data_key"): + node = CSV(source=self.source_single, data_key=5, alt_key="altitude", crs="+proj=merc +vunits=m") + + # invalid (dimension key) + with pytest.raises(ValueError, match="Invalid data_key"): + node = CSV(source=self.source_single, data_key=0, alt_key="altitude", crs="+proj=merc +vunits=m") + + def test_data_key_multiple_outputs(self): + # default + node = CSV(source=self.source_multiple, alt_key="altitude", crs="+proj=merc +vunits=m") + assert node.data_key == ["data", "other"] + + # specify multiple + node = CSV( + source=self.source_multiple, data_key=["other", "data"], alt_key="altitude", crs="+proj=merc +vunits=m" + ) + assert node.data_key == ["other", "data"] - # def test_get_dims(self): - # node = CSV(source=self.source_single, alt_key="altitude", crs="+proj=merc +vunits=m") - # assert node.dims == ["lat", "lon", "time", "alt"] + # specify one + node = CSV(source=self.source_multiple, data_key="other", alt_key="altitude", crs="+proj=merc +vunits=m") + assert node.data_key == "other" - # node = CSV(source=self.source_multiple, alt_key="altitude", crs="+proj=merc +vunits=m") - # assert node.dims == ["lat", "lon", "time", "alt"] + # specify multiple: invalid item + with pytest.raises(ValueError, match="Invalid data_key"): + node = CSV( + source=self.source_multiple, data_key=["data", "misc"], alt_key="altitude", crs="+proj=merc +vunits=m" + ) - # def test_available_keys(self): - # node = CSV(source=self.source_single, alt_key="altitude", crs="+proj=merc +vunits=m") - # assert node.available_keys == ["data"] + # specify one: invalid + with pytest.raises(ValueError, match="Invalid data_key"): + node = CSV(source=self.source_multiple, data_key="misc", alt_key="altitude", crs="+proj=merc +vunits=m") - # node = CSV(source=self.source_multiple, alt_key="altitude", crs="+proj=merc +vunits=m") - # assert node.available_keys == ["data", "other"] + def test_data_key_col_multiple_outputs(self): + # specify multiple + node = CSV(source=self.source_multiple, data_key=[4, 5], alt_key="altitude", crs="+proj=merc +vunits=m") + assert node.data_key == [4, 5] + assert node.outputs == ["data", "other"] - # def test_native_coordinates(self): - # node = CSV(source=self.source_single, alt_key="altitude", crs="+proj=merc +vunits=m") - # nc = node.native_coordinates - # assert nc.dims == ("lat_lon_time_alt",) - # np.testing.assert_array_equal(nc["lat"].coordinates, self.lat) - # np.testing.assert_array_equal(nc["lon"].coordinates, self.lon) - # np.testing.assert_array_equal(nc["time"].coordinates, self.time) - # np.testing.assert_array_equal(nc["alt"].coordinates, self.alt) + # specify one + node = CSV(source=self.source_multiple, data_key=4, alt_key="altitude", crs="+proj=merc +vunits=m") + + assert node.data_key == 4 + assert node.outputs is None + + # specify multiple: invalid item + with pytest.raises(ValueError, match="Invalid data_key"): + node = CSV(source=self.source_multiple, data_key=[4, 6], alt_key="altitude", crs="+proj=merc +vunits=m") + + # specify one: invalid + with pytest.raises(ValueError, match="Invalid data_key"): + node = CSV(source=self.source_multiple, data_key=6, alt_key="altitude", crs="+proj=merc +vunits=m") + + def test_coordinates(self): + node = CSV(source=self.source_single, alt_key="altitude", crs="+proj=merc +vunits=m") + nc = node.coordinates + assert nc.dims == ("lat_lon_time_alt",) + np.testing.assert_array_equal(nc["lat"].coordinates, self.lat) + np.testing.assert_array_equal(nc["lon"].coordinates, 
self.lon) + np.testing.assert_array_equal(nc["time"].coordinates, self.time) + np.testing.assert_array_equal(nc["alt"].coordinates, self.alt) + + # one dim (unstacked) + node = CSV(source=self.source_one_dim) + nc = node.coordinates + assert nc.dims == ("time",) def test_get_data(self): node = CSV(source=self.source_single, alt_key="altitude", data_key="data", crs="+proj=merc +vunits=m") - out = node.eval(node.native_coordinates) + out = node.eval(node.coordinates) np.testing.assert_array_equal(out, self.data) node = CSV(source=self.source_multiple, alt_key="altitude", data_key="data", crs="+proj=merc +vunits=m") - out = node.eval(node.native_coordinates) + out = node.eval(node.coordinates) np.testing.assert_array_equal(out, self.data) node = CSV(source=self.source_multiple, alt_key="altitude", data_key="other", crs="+proj=merc +vunits=m") - out = node.eval(node.native_coordinates) + out = node.eval(node.coordinates) np.testing.assert_array_equal(out, self.other) # default node = CSV(source=self.source_single, alt_key="altitude", crs="+proj=merc +vunits=m") - out = node.eval(node.native_coordinates) + out = node.eval(node.coordinates) np.testing.assert_array_equal(out, self.data) def test_get_data_multiple(self): # multiple data keys node = CSV( - source=self.source_multiple, alt_key="altitude", output_keys=["data", "other"], crs="+proj=merc +vunits=m" + source=self.source_multiple, alt_key="altitude", data_key=["data", "other"], crs="+proj=merc +vunits=m" ) - out = node.eval(node.native_coordinates) + out = node.eval(node.coordinates) assert out.dims == ("lat_lon_time_alt", "output") np.testing.assert_array_equal(out["output"], ["data", "other"]) np.testing.assert_array_equal(out.sel(output="data"), self.data) np.testing.assert_array_equal(out.sel(output="other"), self.other) # single data key - node = CSV(source=self.source_multiple, alt_key="altitude", output_keys=["data"], crs="+proj=merc +vunits=m") - out = node.eval(node.native_coordinates) + node = CSV(source=self.source_multiple, alt_key="altitude", data_key=["data"], crs="+proj=merc +vunits=m") + out = node.eval(node.coordinates) assert out.dims == ("lat_lon_time_alt", "output") np.testing.assert_array_equal(out["output"], ["data"]) np.testing.assert_array_equal(out.sel(output="data"), self.data) @@ -100,11 +180,11 @@ def test_get_data_multiple(self): node = CSV( source=self.source_multiple, alt_key="altitude", - output_keys=["data", "other"], + data_key=["data", "other"], outputs=["a", "b"], crs="+proj=merc +vunits=m", ) - out = node.eval(node.native_coordinates) + out = node.eval(node.coordinates) assert out.dims == ("lat_lon_time_alt", "output") np.testing.assert_array_equal(out["output"], ["a", "b"]) np.testing.assert_array_equal(out.sel(output="a"), self.data) @@ -112,7 +192,7 @@ def test_get_data_multiple(self): # default node = CSV(source=self.source_multiple, alt_key="altitude", crs="+proj=merc +vunits=m") - out = node.eval(node.native_coordinates) + out = node.eval(node.coordinates) assert out.dims == ("lat_lon_time_alt", "output") np.testing.assert_array_equal(out["output"], ["data", "other"]) np.testing.assert_array_equal(out.sel(output="data"), self.data) @@ -129,8 +209,8 @@ def test_cols(self): crs="+proj=merc +vunits=m", ) - # native_coordinates - nc = node.native_coordinates + # coordinates + nc = node.coordinates assert nc.dims == ("lat_lon_time_alt",) np.testing.assert_array_equal(nc["lat"].coordinates, self.lat) np.testing.assert_array_equal(nc["lon"].coordinates, self.lon) @@ -148,13 +228,13 @@ def 
test_cols_multiple(self): lon_key=1, time_key=2, alt_key=3, - output_keys=[4, 5], + data_key=[4, 5], outputs=["a", "b"], crs="+proj=merc +vunits=m", ) # native coordinantes - nc = node.native_coordinates + nc = node.coordinates assert nc.dims == ("lat_lon_time_alt",) np.testing.assert_array_equal(nc["lat"].coordinates, self.lat) np.testing.assert_array_equal(nc["lon"].coordinates, self.lon) @@ -181,7 +261,7 @@ def test_header(self): ) # native coordinantes - nc = node.native_coordinates + nc = node.coordinates assert nc.dims == ("lat_lon_time_alt",) np.testing.assert_array_equal(nc["lat"].coordinates, self.lat) np.testing.assert_array_equal(nc["lon"].coordinates, self.lon) @@ -191,23 +271,3 @@ def test_header(self): # eval out = node.eval(nc) np.testing.assert_array_equal(out, self.data) - - def test_base_definition(self): - node = CSV(source=self.source_single, alt_key="altitude", crs="+proj=merc +vunits=m") - d = node.base_definition - if "attrs" in d: - assert "header" not in d["attrs"] - - node = CSV( - source=self.source_no_header, - lat_key=0, - lon_key=1, - time_key=2, - alt_key=3, - data_key=4, - header=None, - crs="+proj=merc +vunits=m", - ) - d = node.base_definition - assert "attrs" in d - assert "header" in d["attrs"] diff --git a/podpac/core/data/test/test_dataset.py b/podpac/core/data/test/test_dataset.py index 53cf82847..2b5f985ce 100644 --- a/podpac/core/data/test/test_dataset.py +++ b/podpac/core/data/test/test_dataset.py @@ -3,7 +3,7 @@ import pytest -from podpac.core.data.file import Dataset +from podpac.core.data.dataset_source import Dataset class TestDataset(object): @@ -26,17 +26,18 @@ def test_dims(self): assert node.dims == ["time", "lat", "lon"] # un-mapped keys - with pytest.raises(ValueError, match="Unexpected dimension"): - node = Dataset(source=self.source) + # node = Dataset(source=self.source) + # with pytest.raises(ValueError, match="Unexpected dimension"): + # node.dims - def test_available_keys(self): + def test_available_data_keys(self): node = Dataset(source=self.source, time_key="day") - assert node.available_keys == ["data", "other"] + assert node.available_data_keys == ["data", "other"] - def test_native_coordinates(self): + def test_coordinates(self): # specify dimension keys node = Dataset(source=self.source, time_key="day") - nc = node.native_coordinates + nc = node.coordinates assert nc.dims == ("time", "lat", "lon") np.testing.assert_array_equal(nc["lat"].coordinates, self.lat) np.testing.assert_array_equal(nc["lon"].coordinates, self.lon) @@ -46,18 +47,18 @@ def test_native_coordinates(self): def test_get_data(self): # specify data key node = Dataset(source=self.source, time_key="day", data_key="data") - out = node.eval(node.native_coordinates) + out = node.eval(node.coordinates) np.testing.assert_array_equal(out, self.data) node.close_dataset() node = Dataset(source=self.source, time_key="day", data_key="other") - out = node.eval(node.native_coordinates) + out = node.eval(node.coordinates) np.testing.assert_array_equal(out, self.other) node.close_dataset() - def test_get_data_multilpe(self): - node = Dataset(source=self.source, time_key="day", output_keys=["data", "other"]) - out = node.eval(node.native_coordinates) + def test_get_data_multiple(self): + node = Dataset(source=self.source, time_key="day", data_key=["data", "other"]) + out = node.eval(node.coordinates) assert out.dims == ("time", "lat", "lon", "output") np.testing.assert_array_equal(out["output"], ["data", "other"]) np.testing.assert_array_equal(out.sel(output="data"), self.data) 
@@ -65,16 +66,16 @@ def test_get_data_multilpe(self): node.close_dataset() # single - node = Dataset(source=self.source, time_key="day", output_keys=["other"]) - out = node.eval(node.native_coordinates) + node = Dataset(source=self.source, time_key="day", data_key=["other"]) + out = node.eval(node.coordinates) assert out.dims == ("time", "lat", "lon", "output") np.testing.assert_array_equal(out["output"], ["other"]) np.testing.assert_array_equal(out.sel(output="other"), self.other) node.close_dataset() # alternate output names - node = Dataset(source=self.source, time_key="day", output_keys=["data", "other"], outputs=["a", "b"]) - out = node.eval(node.native_coordinates) + node = Dataset(source=self.source, time_key="day", data_key=["data", "other"], outputs=["a", "b"]) + out = node.eval(node.coordinates) assert out.dims == ("time", "lat", "lon", "output") np.testing.assert_array_equal(out["output"], ["a", "b"]) np.testing.assert_array_equal(out.sel(output="a"), self.data) @@ -83,7 +84,7 @@ def test_get_data_multilpe(self): # default node = Dataset(source=self.source, time_key="day") - out = node.eval(node.native_coordinates) + out = node.eval(node.coordinates) assert out.dims == ("time", "lat", "lon", "output") np.testing.assert_array_equal(out["output"], ["data", "other"]) np.testing.assert_array_equal(out.sel(output="data"), self.data) @@ -91,20 +92,8 @@ def test_get_data_multilpe(self): node.close_dataset() def test_extra_dim(self): - # TODO - pass - - def test_base_definition(self): - node = Dataset(source=self.source, time_key="day") - d = node.base_definition - if "attrs" in d: - assert "extra_dim" not in d["attrs"] - node.close_dataset() + # default + node = Dataset(source=self.source) + assert node.extra_dim is None - node = Dataset( - source=self.source, time_key="day", extra_dim={"channel": 1} - ) # TODO actually use source with extra_dim - d = node.base_definition - assert "attrs" in d - assert "extra_dim" in d["attrs"] - node.close_dataset() + # TODO diff --git a/podpac/core/data/test/test_datasource.py b/podpac/core/data/test/test_datasource.py index 50d5205c1..5a5a9290e 100644 --- a/podpac/core/data/test/test_datasource.py +++ b/podpac/core/data/test/test_datasource.py @@ -15,46 +15,33 @@ from podpac.core.node import COMMON_NODE_DOC, NodeException from podpac.core.style import Style from podpac.core.coordinates import Coordinates, clinspace, crange +from podpac.core.interpolation.interpolation import Interpolation, Interpolator +from podpac.core.interpolation.interpolator import Interpolator from podpac.core.data.datasource import DataSource, COMMON_DATA_DOC, DATA_DOC -from podpac.core.data.interpolation import Interpolation -from podpac.core.data.interpolator import Interpolator - - -class MockArrayDataSource(DataSource): - def get_data(self, coordinates, coordinates_index): - return self.create_output_array(coordinates, data=self.source[coordinates_index]) class MockDataSource(DataSource): - data = np.ones((101, 101)) + data = np.ones((11, 11)) data[0, 0] = 10 data[0, 1] = 1 data[1, 0] = 5 data[1, 1] = None - def get_native_coordinates(self): - return Coordinates([clinspace(-25, 25, 101), clinspace(-25, 25, 101)], dims=["lat", "lon"]) + def get_coordinates(self): + return Coordinates([clinspace(-25, 25, 11), clinspace(-25, 25, 11)], dims=["lat", "lon"]) def get_data(self, coordinates, coordinates_index): return self.create_output_array(coordinates, data=self.data[coordinates_index]) -class MockNonuniformDataSource(DataSource): - """ Mock Data Source for testing that is 
non-uniform """ - - # mock 3 x 3 grid of random values - source = np.random.rand(3, 3) - native_coordinates = Coordinates([[-10, -2, -1], [4, 32, 1]], dims=["lat", "lon"]) +class MockDataSourceStacked(DataSource): + data = np.arange(11) - def get_native_coordinates(self): - """ """ - return self.native_coordinates + def get_coordinates(self): + return Coordinates([clinspace((-25, -25), (25, 25), 11)], dims=["lat_lon"]) def get_data(self, coordinates, coordinates_index): - """ """ - s = coordinates_index - d = self.create_output_array(coordinates, data=self.source[s]) - return d + return self.create_output_array(coordinates, data=self.data[coordinates_index]) class TestDataDocs(object): @@ -78,116 +65,119 @@ class TestDataSource(object): def test_init(self): node = DataSource() - def test_nomethods_must_be_implemented(self): + def test_get_data_not_implemented(self): node = DataSource() with pytest.raises(NotImplementedError): - node.get_native_coordinates() + node.get_data(None, None) + def test_get_coordinates_not_implemented(self): + node = DataSource() with pytest.raises(NotImplementedError): - node.get_data(None, None) + node.get_coordinates() - def test_set_native_coordinates(self): - nc = Coordinates([clinspace(0, 50, 101), clinspace(0, 50, 101)], dims=["lat", "lon"]) - node = DataSource(source="test", native_coordinates=nc) - assert node.native_coordinates is not None + def test_coordinates(self): + # not implemented + node = DataSource() + with pytest.raises(NotImplementedError): + node.coordinates - with pytest.raises(tl.TraitError): - DataSource(source="test", native_coordinates="not a coordinate") + # use get_coordinates (once) + class MyDataSource(DataSource): + get_coordinates_called = 0 - with pytest.raises(NotImplementedError): - DataSource(source="test").native_coordinates + def get_coordinates(self): + self.get_coordinates_called += 1 + return Coordinates([]) - def test_get_native_coordinates(self): - # get_native_coordinates should set the native_coordinates by default + node = MyDataSource() + assert node.get_coordinates_called == 0 + assert isinstance(node.coordinates, Coordinates) + assert node.get_coordinates_called == 1 + assert isinstance(node.coordinates, Coordinates) + assert node.get_coordinates_called == 1 + + # can't set + with pytest.raises(AttributeError, match="can't set attribute"): + node.coordinates = Coordinates([]) + + def test_cache_coordinates(self): + class MyDataSource(DataSource): + get_coordinates_called = 0 + + def get_coordinates(self): + self.get_coordinates_called += 1 + return Coordinates([]) + + a = MyDataSource(cache_coordinates=True, cache_ctrl=["ram"]) + b = MyDataSource(cache_coordinates=True, cache_ctrl=["ram"]) + c = MyDataSource(cache_coordinates=False, cache_ctrl=["ram"]) + d = MyDataSource(cache_coordinates=True, cache_ctrl=[]) + + a.rem_cache("*") + b.rem_cache("*") + c.rem_cache("*") + d.rem_cache("*") + + # get_coordinates called once + assert not a.has_cache("coordinates") + assert a.get_coordinates_called == 0 + assert isinstance(a.coordinates, Coordinates) + assert a.get_coordinates_called == 1 + assert isinstance(a.coordinates, Coordinates) + assert a.get_coordinates_called == 1 + + # coordinates is cached to a, b, and c + assert a.has_cache("coordinates") + assert b.has_cache("coordinates") + assert c.has_cache("coordinates") + assert not d.has_cache("coordinates") + + # b: use cache, get_coordinates not called + assert b.get_coordinates_called == 0 + assert isinstance(b.coordinates, Coordinates) + assert 
b.get_coordinates_called == 0 + + # c: don't use cache, get_coordinates called + assert c.get_coordinates_called == 0 + assert isinstance(c.coordinates, Coordinates) + assert c.get_coordinates_called == 1 + + # d: use cache but there is no ram cache for this node, get_coordinates is called + assert d.get_coordinates_called == 0 + assert isinstance(d.coordinates, Coordinates) + assert d.get_coordinates_called == 1 + + def test_set_coordinates(self): node = MockDataSource() - assert node.native_coordinates is not None - np.testing.assert_equal(node.native_coordinates["lat"].coordinates, np.linspace(-25, 25, 101)) - np.testing.assert_equal(node.native_coordinates["lon"].coordinates, np.linspace(-25, 25, 101)) + node.set_coordinates(Coordinates([])) + assert node.coordinates == Coordinates([]) + assert node.coordinates != node.get_coordinates() - # but don't call get_native_coordinates if the native_coordinates are set explicitly - nc = Coordinates([clinspace(0, 50, 101), clinspace(0, 50, 101)], dims=["lat", "lon"]) - node = MockDataSource(native_coordinates=nc) - assert node.native_coordinates is not None - np.testing.assert_equal(node.native_coordinates["lat"].coordinates, nc["lat"].coordinates) - np.testing.assert_equal(node.native_coordinates["lat"].coordinates, nc["lat"].coordinates) + # don't overwrite + node = MockDataSource() + node.coordinates + node.set_coordinates(Coordinates([])) + assert node.coordinates != Coordinates([]) + assert node.coordinates == node.get_coordinates() def test_invalid_interpolation(self): with pytest.raises(tl.TraitError): - MockDataSource(interpolation="myowninterp") + DataSource(interpolation="myowninterp") def test_invalid_nan_vals(self): with pytest.raises(tl.TraitError): - MockDataSource(nan_vals={}) + DataSource(nan_vals={}) with pytest.raises(tl.TraitError): - MockDataSource(nan_vals=10) - - def test_base_definition(self): - """Test definition property method""" - - # TODO: add interpolation definition testing - - node = DataSource(source="test") - d = node.base_definition - assert d - assert "node" in d - assert "source" in d - assert "lookup_source" not in d - assert "interpolation" in d - assert d["source"] == node.source - if "attrs" in d: - assert "nan_vals" not in d["attrs"] - - # keep nan_vals - node = DataSource(source="test", nan_vals=[-999]) - d = node.base_definition - assert "attrs" in d - assert "nan_vals" in d["attrs"] - - # array source - node2 = DataSource(source=np.array([1, 2, 3])) - d = node2.base_definition - assert "source" in d - assert isinstance(d["source"], list) - assert d["source"] == [1, 2, 3] - - # lookup source - node3 = DataSource(source=node) - d = node3.base_definition - assert "source" not in d - assert "lookup_source" in d - - # cannot tag source or interpolation as attr - class MyDataSource1(DataSource): - source = tl.Unicode().tag(attr=True) - - node = MyDataSource1(source="test") - with pytest.raises(NodeException, match="The 'source' property cannot be tagged as an 'attr'"): - node.base_definition - - class MyDataSource2(DataSource): - interpolation = tl.Unicode().tag(attr=True) - - node = MyDataSource2(source="test") - with pytest.raises(NodeException, match="The 'interpolation' property cannot be tagged as an 'attr'"): - node.base_definition + DataSource(nan_vals=10) def test_repr(self): - node = DataSource(source="test", native_coordinates=Coordinates([0, 1], dims=["lat", "lon"])) - repr(node) - - node = DataSource(source="test", native_coordinates=Coordinates([[0, 1]], dims=["lat_lon"])) - repr(node) - - 
class MyDataSource(DataSource): - pass - - node = MyDataSource(source="test") + node = DataSource() repr(node) def test_interpolation_class(self): - node = DataSource(source="test", interpolation="max") + node = DataSource(interpolation="max") assert node.interpolation_class assert isinstance(node.interpolation_class, Interpolation) assert node.interpolation_class.definition == "max" @@ -196,7 +186,7 @@ def test_interpolation_class(self): def test_interpolators(self): node = MockDataSource() - node.eval(node.native_coordinates) + node.eval(node.coordinates) assert isinstance(node.interpolators, OrderedDict) @@ -209,17 +199,17 @@ def test_interpolators(self): assert "lon" in list(node.interpolators.keys())[0] assert isinstance(list(node.interpolators.values())[0], Interpolator) - def test_evaluate_at_native_coordinates(self): - """evaluate node at native coordinates""" + def test_evaluate_at_coordinates(self): + """evaluate node at coordinates""" node = MockDataSource() - output = node.eval(node.native_coordinates) + output = node.eval(node.coordinates) assert isinstance(output, UnitsDataArray) - assert output.shape == (101, 101) + assert output.shape == (11, 11) assert output[0, 0] == 10 - assert output.lat.shape == (101,) - assert output.lon.shape == (101,) + assert output.lat.shape == (11,) + assert output.lon.shape == (11,) # assert coordinates assert isinstance(output.coords, DataArrayCoordinates) @@ -325,39 +315,37 @@ def test_evaluate_with_crs_transform(self): assert round(out.coords["lon"].values[0]) == 1928929.0 def test_evaluate_extra_dims(self): - # drop extra dimension - node = MockArrayDataSource( - source=np.empty((3, 2)), - native_coordinates=Coordinates([[0, 1, 2], [10, 11]], dims=["lat", "lon"]), - interpolation="nearest_preview", - ) + # drop extra unstacked dimension + class MyDataSource(DataSource): + coordinates = Coordinates([1, 11], dims=["lat", "lon"]) - output = node.eval(Coordinates([1, 11, "2018-01-01"], dims=["lat", "lon", "time"])) + def get_data(self, coordinates, coordinates_index): + return self.create_output_array(coordinates) + + node = MyDataSource() + coords = Coordinates([1, 11, "2018-01-01"], dims=["lat", "lon", "time"]) + output = node.eval(coords) assert output.dims == ("lat", "lon") # time dropped # drop extra stacked dimension if none of its dimensions are needed - node = MockArrayDataSource( - source=np.empty((2)), - native_coordinates=Coordinates([["2018-01-01", "2018-01-02"]], dims=["time"]), - interpolation="nearest_preview", - ) + class MyDataSource(DataSource): + coordinates = Coordinates(["2018-01-01"], dims=["time"]) + + def get_data(self, coordinates, coordinates_index): + return self.create_output_array(coordinates) - output = node.eval(Coordinates([[1, 11], "2018-01-01"], dims=["lat_lon", "time"])) + node = MyDataSource() + coords = Coordinates([[1, 11], "2018-01-01"], dims=["lat_lon", "time"]) + output = node.eval(coords) assert output.dims == ("time",) # lat_lon dropped - # don't drop extra stacked dimension if any of its dimensions are needed - # TODO interpolation is not yet implemented - # node = MockArrayDataSource( - # source=np.empty(3), - # native_coordinates=Coordinates([[0, 1, 2]], dims=['lat'])) - # output = node.eval(Coordinates([[1, 11]], dims=['lat_lon'])) - # assert output.dims == ('lat_lon') # lon portion not dropped + # but don't drop extra stacked dimension if any of its dimensions are needed + # output = node.eval(Coordinates([[1, 11, '2018-01-01']], dims=['lat_lon_time'])) + # assert output.dims == ('lat_lon_time') # 
lat and lon not dropped def test_evaluate_missing_dims(self): # missing unstacked dimension - node = MockArrayDataSource( - source=np.empty((3, 2)), native_coordinates=Coordinates([[0, 1, 2], [10, 11]], dims=["lat", "lon"]) - ) + node = MockDataSource() with pytest.raises(ValueError, match="Cannot evaluate these coordinates.*"): node.eval(Coordinates([1], dims=["lat"])) @@ -367,9 +355,7 @@ def test_evaluate_missing_dims(self): node.eval(Coordinates(["2018-01-01"], dims=["time"])) # missing any part of stacked dimension - node = MockArrayDataSource( - source=np.empty(3), native_coordinates=Coordinates([[[0, 1, 2], [10, 11, 12]]], dims=["lat_lon"]) - ) + node = MockDataSourceStacked() with pytest.raises(ValueError, match="Cannot evaluate these coordinates.*"): node.eval(Coordinates([1], dims=["time"])) @@ -387,27 +373,25 @@ def test_evaluate_no_overlap(self): assert np.all(np.isnan(output)) def test_evaluate_extract_output(self): - coords = Coordinates([[0, 1, 2, 3], [10, 11]], dims=["lat", "lon"]) - - class MockMultipleDataSource(DataSource): + class MyMultipleDataSource(DataSource): outputs = ["a", "b", "c"] - native_coordinates = coords + coordinates = Coordinates([[0, 1, 2, 3], [10, 11]], dims=["lat", "lon"]) def get_data(self, coordinates, coordinates_index): return self.create_output_array(coordinates, data=1) # don't extract when no output field is requested - node = MockMultipleDataSource() - o = node.eval(coords) + node = MyMultipleDataSource() + o = node.eval(node.coordinates) assert o.shape == (4, 2, 3) np.testing.assert_array_equal(o.dims, ["lat", "lon", "output"]) np.testing.assert_array_equal(o["output"], ["a", "b", "c"]) np.testing.assert_array_equal(o, 1) # do extract when an output field is requested - node = MockMultipleDataSource(output="b") + node = MyMultipleDataSource(output="b") - o = node.eval(coords) # get_data case + o = node.eval(node.coordinates) # get_data case assert o.shape == (4, 2) np.testing.assert_array_equal(o.dims, ["lat", "lon"]) np.testing.assert_array_equal(o, 1) @@ -418,16 +402,13 @@ def get_data(self, coordinates, coordinates_index): np.testing.assert_array_equal(o, np.nan) # should still work if the node has already extracted it - class MockMultipleDataSource2(DataSource): - outputs = ["a", "b", "c"] - native_coordinates = coords - + class MyMultipleDataSource2(MyMultipleDataSource): def get_data(self, coordinates, coordinates_index): out = self.create_output_array(coordinates, data=1) return out.sel(output=self.output) - node = MockMultipleDataSource2(output="b") - o = node.eval(coords) + node = MyMultipleDataSource2(output="b") + o = node.eval(node.coordinates) assert o.shape == (4, 2) np.testing.assert_array_equal(o.dims, ["lat", "lon"]) np.testing.assert_array_equal(o, 1) @@ -435,10 +416,26 @@ def get_data(self, coordinates, coordinates_index): def test_nan_vals(self): """ evaluate note with nan_vals """ - node = MockDataSource(nan_vals=[10, None]) - output = node.eval(node.native_coordinates) - - assert output.values[np.isnan(output)].shape == (2,) + # none + node = MockDataSource() + output = node.eval(node.coordinates) + assert np.sum(np.isnan(output)) == 1 + assert np.isnan(output[1, 1]) + + # one value + node = MockDataSource(nan_vals=[10]) + output = node.eval(node.coordinates) + assert np.sum(np.isnan(output)) == 2 + assert np.isnan(output[0, 0]) + assert np.isnan(output[1, 1]) + + # multiple values + node = MockDataSource(nan_vals=[10, 5]) + output = node.eval(node.coordinates) + assert np.sum(np.isnan(output)) == 3 + assert 
np.isnan(output[0, 0]) + assert np.isnan(output[1, 1]) + assert np.isnan(output[1, 0]) def test_get_data_np_array(self): class MockDataSourceReturnsArray(MockDataSource): @@ -446,10 +443,10 @@ def get_data(self, coordinates, coordinates_index): return self.data[coordinates_index] node = MockDataSourceReturnsArray() - output = node.eval(node.native_coordinates) + output = node.eval(node.coordinates) assert isinstance(output, UnitsDataArray) - assert node.native_coordinates["lat"].coordinates[4] == output.coords["lat"].values[4] + assert node.coordinates["lat"].coordinates[4] == output.coords["lat"].values[4] def test_get_data_DataArray(self): class MockDataSourceReturnsDataArray(MockDataSource): @@ -457,17 +454,17 @@ def get_data(self, coordinates, coordinates_index): return xr.DataArray(self.data[coordinates_index]) node = MockDataSourceReturnsDataArray() - output = node.eval(node.native_coordinates) + output = node.eval(node.coordinates) assert isinstance(output, UnitsDataArray) - assert node.native_coordinates["lat"].coordinates[4] == output.coords["lat"].values[4] + assert node.coordinates["lat"].coordinates[4] == output.coords["lat"].values[4] def test_find_coordinates(self): node = MockDataSource() l = node.find_coordinates() assert isinstance(l, list) assert len(l) == 1 - assert l[0] == node.native_coordinates + assert l[0] == node.coordinates class TestInterpolateData(object): @@ -481,15 +478,18 @@ def test_one_data_point(self): def test_interpolate_time(self): """ for now time uses nearest neighbor """ - source = np.random.rand(5) - coords_src = Coordinates([clinspace(0, 10, 5)], dims=["time"]) - coords_dst = Coordinates([clinspace(1, 11, 5)], dims=["time"]) + class MyDataSource(DataSource): + coordinates = Coordinates([clinspace(0, 10, 5)], dims=["time"]) + + def get_data(self, coordinates, coordinates_index): + return self.create_output_array(coordinates) - node = MockArrayDataSource(source=source, native_coordinates=coords_src) - output = node.eval(coords_dst) + node = MyDataSource() + coords = Coordinates([clinspace(1, 11, 5)], dims=["time"]) + output = node.eval(coords) assert isinstance(output, UnitsDataArray) - assert np.all(output.time.values == coords_dst.coords["time"]) + assert np.all(output.time.values == coords.coords["time"]) def test_interpolate_lat_time(self): """interpolate with n dims and time""" @@ -498,12 +498,16 @@ def test_interpolate_lat_time(self): def test_interpolate_alt(self): """ for now alt uses nearest neighbor """ - source = np.random.rand(5) - coords_src = Coordinates([clinspace(0, 10, 5)], dims=["alt"], crs="+proj=merc +vunits=m") - coords_dst = Coordinates([clinspace(1, 11, 5)], dims=["alt"], crs="+proj=merc +vunits=m") + class MyDataSource(DataSource): + coordinates = Coordinates([clinspace(0, 10, 5)], dims=["alt"], crs="+proj=merc +vunits=m") + + def get_data(self, coordinates, coordinates_index): + return self.create_output_array(coordinates) + + coords = Coordinates([clinspace(1, 11, 5)], dims=["alt"], crs="+proj=merc +vunits=m") - node = MockArrayDataSource(source=source, native_coordinates=coords_src) - output = node.eval(coords_dst) + node = MyDataSource() + output = node.eval(coords) assert isinstance(output, UnitsDataArray) - assert np.all(output.alt.values == coords_dst.coords["alt"]) + assert np.all(output.alt.values == coords.coords["alt"]) diff --git a/podpac/core/data/test/test_file_source.py b/podpac/core/data/test/test_file_source.py new file mode 100644 index 000000000..82e6f95c5 --- /dev/null +++ 
b/podpac/core/data/test/test_file_source.py @@ -0,0 +1,260 @@ +import os + +import numpy as np +import traitlets as tl +import pytest + +import podpac +from podpac.core.data.file_source import BaseFileSource +from podpac.core.data.file_source import LoadFileMixin +from podpac.core.data.file_source import FileKeysMixin + +LAT = [0, 1, 2] +LON = [10, 20] +TIME = [100, 200] +ALT = [1, 2, 3, 4] +DATA = np.arange(48).reshape((3, 2, 2, 4)) +OTHER = 2 * np.arange(48).reshape((3, 2, 2, 4)) + + +class TestBaseFileSource(object): + def test_source_required(self): + node = BaseFileSource() + with pytest.raises(ValueError, match="'source' required"): + node.source + + def test_dataset_not_implemented(self): + node = BaseFileSource(source="mysource") + with pytest.raises(NotImplementedError): + node.dataset + + def test_close(self): + node = BaseFileSource(source="mysource") + node.close_dataset() + + def test_repr_str(self): + node = BaseFileSource(source="mysource") + assert "source=" in repr(node) + assert "source=" in str(node) + + +# --------------------------------------------------------------------------------------------------------------------- +# LoadFileMixin +# --------------------------------------------------------------------------------------------------------------------- + + +class MockLoadFile(LoadFileMixin, BaseFileSource): + def open_dataset(self, f): + return None + + +class TestLoadFile(object): + def test_open_dataset_not_implemented(self): + node = LoadFileMixin() + with pytest.raises(NotImplementedError): + node.open_dataset(None) + + def test_local(self): + path = os.path.join(os.path.dirname(__file__), "assets/points-single.csv") + node = MockLoadFile(source=path) + node.dataset + + @pytest.mark.aws + def test_s3(self): + # TODO replace this with a better public s3 fileobj for testing + path = "s3://modis-pds/MCD43A4.006/00/08/2020018/MCD43A4.A2020018.h00v08.006.2020027031229_meta.json" + node = MockLoadFile(source=path) + node.dataset + + @pytest.mark.aws # TODO + def test_ftp(self): + node = MockLoadFile(source="ftp://speedtest.tele2.net/1KB.zip") + node.dataset + + @pytest.mark.aws # TODO + def test_http(self): + node = MockLoadFile(source="https://httpstat.us/200") + node.dataset + + def test_file(self): + path = os.path.join(os.path.dirname(__file__), "assets/points-single.csv") + node = MockLoadFile(source="file:///%s" % path) + node.dataset + + def test_cache_dataset(self): + path = os.path.join(os.path.dirname(__file__), "assets/points-single.csv") + + with podpac.settings: + podpac.settings["DEFAULT_CACHE"] = ["ram"] + node = MockLoadFile(source="file:///%s" % path) + node.dataset + + # node caches dataset object + assert node._dataset_caching_node.has_cache("dataset") + + # another node can get cached object + node2 = MockLoadFile(source="file:///%s" % path) + assert node2._dataset_caching_node.has_cache("dataset") + node2.dataset + + +# --------------------------------------------------------------------------------------------------------------------- +# FileKeysMixin +# --------------------------------------------------------------------------------------------------------------------- + + +class MockFileKeys(FileKeysMixin, BaseFileSource): + source = "mock-single" + dataset = {"lat": LAT, "lon": LON, "time": TIME, "alt": ALT, "data": DATA} + keys = ["lat", "lon", "time", "alt", "data"] + dims = ["lat", "lon", "time", "alt"] + + +class MockFileKeysMultipleAvailable(FileKeysMixin, BaseFileSource): + source = "mock-multiple" + dataset = {"lat": LAT, "lon": LON, 
"time": TIME, "alt": ALT, "data": DATA, "other": OTHER} + keys = ["lat", "lon", "time", "alt", "data", "other"] + dims = ["lat", "lon", "time", "alt"] + + +class MockFileKeysEmpty(FileKeysMixin, BaseFileSource): + source = "mock-empty" + dataset = {"lat": LAT, "lon": LON, "time": TIME, "alt": ALT} + keys = ["lat", "lon", "time", "alt"] + dims = ["lat", "lon", "time", "alt"] + + +class TestFileKeys(object): + def test_not_implemented(self): + class MySource(FileKeysMixin, BaseFileSource): + pass + + node = MySource(source="mysource") + + with pytest.raises(NotImplementedError): + node.keys + + with pytest.raises(NotImplementedError): + node.dims + + def test_available_data_keys(self): + node = MockFileKeys() + assert node.available_data_keys == ["data"] + + node = MockFileKeysMultipleAvailable() + assert node.available_data_keys == ["data", "other"] + + node = MockFileKeysEmpty() + with pytest.raises(ValueError, match="No data keys found"): + node.available_data_keys + + def test_data_key(self): + node = MockFileKeys() + assert node.data_key == "data" + + node = MockFileKeys(data_key="data") + assert node.data_key == "data" + + with pytest.raises(ValueError, match="Invalid data_key"): + node = MockFileKeys(data_key="misc") + + def test_data_key_multiple_outputs(self): + node = MockFileKeysMultipleAvailable() + assert node.data_key == ["data", "other"] + + node = MockFileKeysMultipleAvailable(data_key=["other", "data"]) + assert node.data_key == ["other", "data"] + + node = MockFileKeysMultipleAvailable(data_key="other") + assert node.data_key == "other" + + with pytest.raises(ValueError, match="Invalid data_key"): + node = MockFileKeysMultipleAvailable(data_key=["data", "misc"]) + + with pytest.raises(ValueError, match="Invalid data_key"): + node = MockFileKeysMultipleAvailable(data_key="misc") + + def test_no_outputs(self): + node = MockFileKeys(data_key="data") + assert node.outputs == None + + node = MockFileKeysMultipleAvailable(data_key="data") + assert node.outputs == None + + with pytest.raises(TypeError, match="outputs must be None for single-output nodes"): + node = MockFileKeys(data_key="data", outputs=["a"]) + + with pytest.raises(TypeError, match="outputs must be None for single-output nodes"): + node = MockFileKeysMultipleAvailable(data_key="data", outputs=["a"]) + + with pytest.raises(TypeError, match="outputs must be None for single-output nodes"): + node = MockFileKeys(outputs=["a"]) + + def test_outputs(self): + # for multi-output nodes, use the dataset's keys by default + node = MockFileKeys(data_key=["data"]) + assert node.outputs == ["data"] + + node = MockFileKeysMultipleAvailable(data_key=["data", "other"]) + assert node.outputs == ["data", "other"] + + node = MockFileKeysMultipleAvailable(data_key=["data"]) + assert node.outputs == ["data"] + + # alternate outputs names can be specified + node = MockFileKeys(data_key=["data"], outputs=["a"]) + assert node.outputs == ["a"] + + node = MockFileKeysMultipleAvailable(data_key=["data", "other"], outputs=["a", "b"]) + assert node.outputs == ["a", "b"] + + node = MockFileKeysMultipleAvailable(data_key=["data"], outputs=["a"]) + assert node.outputs == ["a"] + + node = MockFileKeysMultipleAvailable(outputs=["a", "b"]) + assert node.outputs == ["a", "b"] + + # but the outputs and data_key must match + with pytest.raises(TypeError, match="outputs and data_key mismatch"): + node = MockFileKeysMultipleAvailable(data_key=["data"], outputs=None) + + with pytest.raises(ValueError, match="outputs and data_key size mismatch"): + node = 
MockFileKeysMultipleAvailable(data_key=["data"], outputs=["a", "b"]) + + with pytest.raises(ValueError, match="outputs and data_key size mismatch"): + node = MockFileKeysMultipleAvailable(data_key=["data", "other"], outputs=["a"]) + + def test_coordinates(self): + node = MockFileKeys() + nc = node.coordinates + assert nc.dims == ("lat", "lon", "time", "alt") + np.testing.assert_array_equal(nc["lat"].coordinates, LAT) + np.testing.assert_array_equal(nc["lon"].coordinates, LON) + np.testing.assert_array_equal(nc["time"].coordinates, TIME) + np.testing.assert_array_equal(nc["alt"].coordinates, ALT) + + def test_repr_str(self): + node = MockFileKeys() + + assert "source=" in repr(node) + assert "data_key=" not in repr(node) + + assert "source=" in str(node) + assert "data_key=" not in str(node) + + def test_repr_str_multiple_outputs(self): + node = MockFileKeysMultipleAvailable() + + assert "source=" in repr(node) + assert "data_key=" not in repr(node) + + assert "source=" in str(node) + assert "data_key=" not in str(node) + + node = MockFileKeysMultipleAvailable(data_key="data") + + assert "source=" in repr(node) + assert "data_key=" in repr(node) + + assert "source=" in str(node) + assert "data_key=" in str(node) diff --git a/podpac/core/data/test/test_h5py.py b/podpac/core/data/test/test_h5py.py index c34ed3253..ebb76f736 100644 --- a/podpac/core/data/test/test_h5py.py +++ b/podpac/core/data/test/test_h5py.py @@ -2,7 +2,7 @@ import numpy as np -from podpac.core.data.file import H5PY +from podpac.core.data.h5py_source import H5PY class TestH5PY(object): @@ -17,51 +17,51 @@ def test_dims(self): assert node.dims == ["lat", "lon"] node.close_dataset() - def test_available_keys(self): + def test_available_data_keys(self): node = H5PY(source=self.source, data_key="/data/init", lat_key="/coords/lat", lon_key="/coords/lon") - assert node.available_keys == ["/data/init"] + assert node.available_data_keys == ["/data/init"] node.close_dataset() - def test_native_coordinates(self): + def test_coordinates(self): node = H5PY(source=self.source, data_key="/data/init", lat_key="/coords/lat", lon_key="/coords/lon") - nc = node.native_coordinates - assert node.native_coordinates.shape == (3, 4) - np.testing.assert_array_equal(node.native_coordinates["lat"].coordinates, [45.1, 45.2, 45.3]) - np.testing.assert_array_equal(node.native_coordinates["lon"].coordinates, [-100.1, -100.2, -100.3, -100.4]) + nc = node.coordinates + assert node.coordinates.shape == (3, 4) + np.testing.assert_array_equal(node.coordinates["lat"].coordinates, [45.1, 45.2, 45.3]) + np.testing.assert_array_equal(node.coordinates["lon"].coordinates, [-100.1, -100.2, -100.3, -100.4]) node.close_dataset() def test_data(self): node = H5PY(source=self.source, data_key="/data/init", lat_key="/coords/lat", lon_key="/coords/lon") - o = node.eval(node.native_coordinates) + o = node.eval(node.coordinates) np.testing.assert_array_equal(o.data.ravel(), np.arange(12)) node.close_dataset() # default node = H5PY(source=self.source, lat_key="/coords/lat", lon_key="/coords/lon") - o = node.eval(node.native_coordinates) + o = node.eval(node.coordinates) np.testing.assert_array_equal(o.data.ravel(), np.arange(12)) node.close_dataset() def test_data_multiple(self): node = H5PY( source=self.source, - output_keys=["/data/init", "/data/init"], + data_key=["/data/init", "/data/init"], outputs=["a", "b"], lat_key="/coords/lat", lon_key="/coords/lon", ) - o = node.eval(node.native_coordinates) + o = node.eval(node.coordinates) assert o.dims == ("lat", "lon", 
"output") np.testing.assert_array_equal(o["output"], ["a", "b"]) np.testing.assert_array_equal(o.sel(output="a").data.ravel(), np.arange(12)) np.testing.assert_array_equal(o.sel(output="b").data.ravel(), np.arange(12)) node.close_dataset() - def test_attrs(self): + def test_dataset_attrs(self): node = H5PY(source=self.source, data_key="/data/init", lat_key="/coords/lat", lon_key="/coords/lon") - assert node.attrs() == {} - assert node.attrs("data") == {"test": "test"} - assert node.attrs("coords/lat") == {"unit": "degrees"} - assert node.attrs("coords/lon") == {"unit": "degrees"} - assert node.attrs("coords") == {"crs": "EPSG:4326s"} + assert node.dataset_attrs() == {} + assert node.dataset_attrs("data") == {"test": "test"} + assert node.dataset_attrs("coords/lat") == {"unit": "degrees"} + assert node.dataset_attrs("coords/lon") == {"unit": "degrees"} + assert node.dataset_attrs("coords") == {"crs": "EPSG:4326s"} node.close_dataset() diff --git a/podpac/core/data/test/test_integration.py b/podpac/core/data/test/test_integration.py index 089746b99..f5337812d 100644 --- a/podpac/core/data/test/test_integration.py +++ b/podpac/core/data/test/test_integration.py @@ -29,7 +29,7 @@ def test_array(self): lat = np.random.rand(16) lon = np.random.rand(16) coord = Coordinate(lat_lon=(lat, lon), time=(0, 10, 11), order=["lat_lon", "time"]) - node = Array(source=arr, native_coordinates=coord) + node = Array(source=arr, coordinates=coord) coordg = Coordinate(lat=(0, 1, 8), lon=(0, 1, 8), order=("lat", "lon")) coordt = Coordinate(time=(3, 5, 2)) @@ -85,11 +85,11 @@ def setup_method(self, method): self.lonSource = LON self.timeSource = TIME - self.nasLat = Array(source=LAT.astype(float), native_coordinates=self.coord_src, interpolation="bilinear") + self.nasLat = Array(source=LAT.astype(float), coordinates=self.coord_src, interpolation="bilinear") - self.nasLon = Array(source=LON.astype(float), native_coordinates=self.coord_src, interpolation="bilinear") + self.nasLon = Array(source=LON.astype(float), coordinates=self.coord_src, interpolation="bilinear") - self.nasTime = Array(source=TIME.astype(float), native_coordinates=self.coord_src, interpolation="bilinear") + self.nasTime = Array(source=TIME.astype(float), coordinates=self.coord_src, interpolation="bilinear") def test_raster_to_raster(self): coord_dst = Coordinates([clinspace(5.0, 40.0, 50), clinspace(-68.0, -66.0, 100)], dims=["lat", "lon"]) diff --git a/podpac/core/data/test/test_pydap.py b/podpac/core/data/test/test_pydap.py index 5c037e0f6..d91cccef1 100644 --- a/podpac/core/data/test/test_pydap.py +++ b/podpac/core/data/test/test_pydap.py @@ -1,126 +1,98 @@ import pydap -import numpy as np import pytest -from traitlets import TraitError +import numpy as np +import traitlets as tl +import requests from podpac.core.coordinates import Coordinates, clinspace from podpac.core.units import UnitsDataArray +from podpac.core import authentication from podpac.core.data.pydap_source import PyDAP - -# Trying to fix test -pydap.client.open_url +from podpac import settings class MockPyDAP(PyDAP): """mock pydap data source """ source = "http://demo.opendap.org" - username = "username" - password = "password" - datakey = "key" + data_key = "key" + data = np.random.rand(11, 11) + + def get_coordinates(self): + return Coordinates([clinspace(-25, 25, 11), clinspace(-25, 25, 11)], dims=["lat", "lon"]) - def get_native_coordinates(self): - return self.native_coordinates + def _open_url(self): + base = pydap.model.BaseType(name="key", data=self.data) + dataset = 
pydap.model.DatasetType(name="dataset") + dataset["key"] = base + return dataset class TestPyDAP(object): """test pydap datasource""" source = "http://demo.opendap.org" - username = "username" - password = "password" - datakey = "key" - - # mock parameters and data - data = np.random.rand(11, 11) # mocked from pydap endpoint - coordinates = Coordinates([clinspace(-25, 25, 11), clinspace(-25, 25, 11)], dims=["lat", "lon"]) - - def mock_pydap(self): - def open_url(url, session=None): - base = pydap.model.BaseType(name="key", data=self.data) - dataset = pydap.model.DatasetType(name="dataset") - dataset["key"] = base - return dataset - - pydap.client.open_url = open_url + data_key = "key" def test_init(self): - """test basic init of class""" + node = PyDAP(source="mysource", data_key="key") - node = PyDAP(source=self.source, datakey=self.datakey, username=self.username, password=self.password) - assert isinstance(node, PyDAP) + def test_coordinates_not_implemented(self): + node = PyDAP(source="mysource", data_key="key") + with pytest.raises(NotImplementedError): + node.coordinates - node = MockPyDAP() - assert isinstance(node, MockPyDAP) - - def test_traits(self): - """ check each of the pydap traits """ - - with pytest.raises(TraitError): - PyDAP(source=5, datakey=self.datakey) + def test_keys(self): + """test return of dataset keys""" - with pytest.raises(TraitError): - PyDAP(source=self.source, datakey=5) + node = MockPyDAP() + keys = node.keys + assert "key" in keys - nodes = [PyDAP(source=self.source, datakey=self.datakey), MockPyDAP()] + def test_session(self): + """test session attribute and traitlet default """ - # TODO: in traitlets, if you already define variable, it won't enforce case on - # redefinition - with pytest.raises(TraitError): - nodes[0].username = 5 + # hostname should be the same as the source, parsed by request + node = PyDAP(source=self.source, data_key=self.data_key) + assert node.hostname == "demo.opendap.org" - with pytest.raises(TraitError): - nodes[0].password = 5 + # defaults to no auth required + assert node.auth_required == False - for node in nodes: - with pytest.raises(TraitError): - node.auth_class = "auth_class" + # session should be available + assert node.session + assert isinstance(node.session, requests.Session) - with pytest.raises(TraitError): - node.auth_session = "auth_class" + # auth required + with settings: + if "username@test.org" in settings: + del settings["username@test.org"] - with pytest.raises(TraitError): - node.dataset = [1, 2, 3] + if "password@test.org" in settings: + del settings["password@test.org"] - def test_auth_session(self): - """test auth_session attribute and traitlet default """ + node = PyDAP(source=self.source, data_key=self.data_key, hostname="test.org", auth_required=True) + assert node.hostname == "test.org" - # default to none if no username and password - node = PyDAP(source=self.source, datakey=self.datakey) - assert node.auth_session is None + # throw auth error + with pytest.raises(ValueError): + s = node.session - # default to none if no auth_class - node = PyDAP(source=self.source, datakey=self.datakey, username=self.username, password=self.password) - assert node.auth_session is None + node.set_credentials(username="user", password="pass") + assert node.session + assert isinstance(node.session, requests.Session) def test_dataset(self): - """test dataset trait """ - self.mock_pydap() - - node = PyDAP(source=self.source, datakey=self.datakey) + node = MockPyDAP() assert isinstance(node.dataset, 
pydap.model.DatasetType) + def test_url_error(self): + node = PyDAP(source="mysource") + with pytest.raises(RuntimeError): + node.dataset + def test_get_data(self): """test get_data function of pydap""" - self.mock_pydap() - - node = PyDAP(source=self.source, datakey=self.datakey, native_coordinates=self.coordinates) - output = node.eval(self.coordinates) - assert isinstance(output, UnitsDataArray) - assert output.values[0, 0] == self.data[0, 0] - - node = MockPyDAP(native_coordinates=self.coordinates) - output = node.eval(self.coordinates) - assert isinstance(output, UnitsDataArray) - - def test_native_coordinates(self): - """test native coordinates of pydap datasource""" - pass - - def test_keys(self): - """test return of dataset keys""" - self.mock_pydap() - - node = MockPyDAP(native_coordinates=self.coordinates) - keys = node.keys - assert "key" in keys + node = MockPyDAP() + output = node.eval(node.coordinates) + np.testing.assert_array_equal(output.values, node.data) diff --git a/podpac/core/data/test/test_rasterio.py b/podpac/core/data/test/test_rasterio.py index f76d3548a..4aec3da83 100644 --- a/podpac/core/data/test/test_rasterio.py +++ b/podpac/core/data/test/test_rasterio.py @@ -8,17 +8,7 @@ from podpac.core.coordinates import Coordinates from podpac.core.units import UnitsDataArray -from podpac.core.data.file import Rasterio - - -class MockRasterio(Rasterio): - """mock rasterio data source """ - - source = os.path.join(os.path.dirname(__file__), "assets/RGB.byte.tif") - band = 1 - - def get_native_coordinates(self): - return self.native_coordinates +from podpac.core.data.rasterio_source import Rasterio class TestRasterio(object): @@ -31,19 +21,6 @@ def test_init(self): """test basic init of class""" node = Rasterio(source=self.source, band=self.band) - assert isinstance(node, Rasterio) - - node = MockRasterio() - assert isinstance(node, MockRasterio) - - def test_traits(self): - """ check each of the rasterio traits """ - - with pytest.raises(TraitError): - Rasterio(source=5, band=self.band) - - with pytest.raises(TraitError): - Rasterio(source=self.source, band="test") def test_dataset(self): """test dataset attribute and trait default """ @@ -57,44 +34,43 @@ def test_dataset(self): node.close_dataset() - def test_default_native_coordinates(self): - """test default native coordinates implementations""" + def test_coordinates(self): + """test default coordinates implementations""" node = Rasterio(source=self.source) - native_coordinates = node.get_native_coordinates() - assert isinstance(native_coordinates, Coordinates) - assert len(native_coordinates["lat"]) == 718 + assert isinstance(node.coordinates, Coordinates) + assert len(node.coordinates["lat"]) == 718 def test_get_data(self): """test default get_data method""" node = Rasterio(source=self.source) - native_coordinates = node.get_native_coordinates() - output = node.eval(native_coordinates) - + output = node.eval(node.coordinates) assert isinstance(output, UnitsDataArray) - def test_band_descriptions(self): - """test band count method""" - node = Rasterio(source=self.source) - bands = node.band_descriptions - assert bands and isinstance(bands, OrderedDict) - def test_band_count(self): """test band descriptions methods""" node = Rasterio(source=self.source) - count = node.band_count - assert count and isinstance(count, int) + assert node.band_count == 3 + + def test_band_descriptions(self): + """test band count method""" + node = Rasterio(source=self.source) + assert isinstance(node.band_descriptions, OrderedDict) + assert 
list(node.band_descriptions.keys()) == [0, 1, 2] def test_band_keys(self): """test band keys methods""" node = Rasterio(source=self.source) - keys = node.band_keys - assert keys and isinstance(keys, dict) + assert set(node.band_keys.keys()) == { + "STATISTICS_STDDEV", + "STATISTICS_MINIMUM", + "STATISTICS_MEAN", + "STATISTICS_MAXIMUM", + } def test_get_band_numbers(self): """test band numbers methods""" node = Rasterio(source=self.source) numbers = node.get_band_numbers("STATISTICS_MINIMUM", "0") - assert isinstance(numbers, np.ndarray) - np.testing.assert_array_equal(numbers, np.arange(3) + 1) + np.testing.assert_array_equal(numbers, [1, 2, 3]) diff --git a/podpac/core/data/test/test_reprojected_source.py b/podpac/core/data/test/test_reprojected_source.py index 65f54e93b..9d2318a28 100644 --- a/podpac/core/data/test/test_reprojected_source.py +++ b/podpac/core/data/test/test_reprojected_source.py @@ -6,6 +6,8 @@ from podpac.core.coordinates import Coordinates, clinspace from podpac.core.units import UnitsDataArray from podpac.core.node import Node +from podpac.core.algorithm.utility import Arange +from podpac.core.data.datasource import DataSource from podpac.core.data.array_source import Array from podpac.core.data.reprojection import ReprojectedSource @@ -16,70 +18,88 @@ class TestReprojectedSource(object): TODO: this needs to be reworked with real examples """ - source = Node() data = np.random.rand(11, 11) - native_coordinates = Coordinates([clinspace(-25, 25, 11), clinspace(-25, 25, 11)], dims=["lat", "lon"]) + coordinates = Coordinates([clinspace(-25, 25, 11), clinspace(-25, 25, 11)], dims=["lat", "lon"]) reprojected_coordinates = Coordinates([clinspace(-25, 50, 11), clinspace(-25, 50, 11)], dims=["lat", "lon"]) def test_init(self): """test basic init of class""" - node = ReprojectedSource(source=self.source) + node = ReprojectedSource(source=Node(), reprojected_coordinates=self.reprojected_coordinates) assert isinstance(node, ReprojectedSource) - def test_traits(self): - """ check each of the s3 traits """ + def test_coordinates(self): + """test coordinates""" - ReprojectedSource(source=self.source) - with pytest.raises(TraitError): - ReprojectedSource(source=5) + # source has no coordinates, just use reprojected_coordinates + node = ReprojectedSource(source=Node(), reprojected_coordinates=self.reprojected_coordinates) + assert node.coordinates == self.reprojected_coordinates - ReprojectedSource(source_interpolation="bilinear") - with pytest.raises(TraitError): - ReprojectedSource(source_interpolation=5) - - ReprojectedSource(reprojected_coordinates=self.reprojected_coordinates) - with pytest.raises(TraitError): - ReprojectedSource(reprojected_coordinates=5) - - def test_native_coordinates(self): - """test native coordinates""" - - # error if no source has coordinates - with pytest.raises(Exception): - node = ReprojectedSource(source=Node()) - node.native_coordinates - - # source as Node - node = ReprojectedSource(source=self.source, reprojected_coordinates=self.reprojected_coordinates) - assert isinstance(node.native_coordinates, Coordinates) - assert node.native_coordinates["lat"].coordinates[0] == self.reprojected_coordinates["lat"].coordinates[0] + # source has coordinates + source = Array(coordinates=self.coordinates) + node = ReprojectedSource(source=source, reprojected_coordinates=self.reprojected_coordinates) + assert node.coordinates == self.reprojected_coordinates def test_get_data(self): """test get data from reprojected source""" - datanode = Array(source=self.data, 
native_coordinates=self.native_coordinates) - node = ReprojectedSource(source=datanode, reprojected_coordinates=datanode.native_coordinates) - output = node.eval(node.native_coordinates) - assert isinstance(output, UnitsDataArray) + source = Array(source=self.data, coordinates=self.coordinates) + node = ReprojectedSource(source=source, reprojected_coordinates=source.coordinates) + output = node.eval(node.coordinates) - def test_base_ref(self): - """test base ref""" + def test_source_interpolation(self): + """test get data from reprojected source""" - node = ReprojectedSource(source=self.source, reprojected_coordinates=self.reprojected_coordinates) - ref = node.base_ref - assert "_reprojected" in ref + # no source_interpolation + source = Array(source=self.data, coordinates=self.coordinates, interpolation="nearest") + node = ReprojectedSource(source=source, reprojected_coordinates=self.reprojected_coordinates) + assert source.interpolation == "nearest" + assert node.source.interpolation == "nearest" + assert node.eval_source.interpolation == "nearest" + assert node.eval_source.coordinates == source.coordinates + np.testing.assert_array_equal(node.eval_source.source, source.source) + + # matching source_interpolation + source = Array(source=self.data, coordinates=self.coordinates, interpolation="nearest") + node = ReprojectedSource( + source=source, reprojected_coordinates=self.reprojected_coordinates, source_interpolation="nearest" + ) + assert source.interpolation == "nearest" + assert node.source.interpolation == "nearest" + assert node.eval_source.interpolation == "nearest" + assert node.eval_source.coordinates == source.coordinates + np.testing.assert_array_equal(node.eval_source.source, source.source) + + # non-matching source_interpolation + source = Array(source=self.data, coordinates=self.coordinates, interpolation="nearest") + node = ReprojectedSource( + source=source, reprojected_coordinates=self.reprojected_coordinates, source_interpolation="bilinear" + ) + assert source.interpolation == "nearest" + assert node.source.interpolation == "nearest" + assert node.eval_source.interpolation == "bilinear" + assert node.eval_source.coordinates == source.coordinates + np.testing.assert_array_equal(node.eval_source.source, source.source) + + # no source.interpolation to set (trigger logger warning) + source = Node() + node = ReprojectedSource( + source=source, reprojected_coordinates=self.reprojected_coordinates, source_interpolation="bilinear" + ) + + def test_interpolation_warning(self): + node = ReprojectedSource(source=Arange(), reprojected_coordinates=self.coordinates) + output = node.eval(node.coordinates) - def test_base_definition(self): - """test definition""" + def test_base_ref(self): + """test base ref""" - node = ReprojectedSource(source=self.source, reprojected_coordinates=self.reprojected_coordinates) - d = node.base_definition - assert d["attrs"]["reprojected_coordinates"] == self.reprojected_coordinates + node = ReprojectedSource(source=Node(), reprojected_coordinates=self.reprojected_coordinates) + assert "_reprojected" in node.base_ref def test_deserialize_reprojected_coordinates(self): - node1 = ReprojectedSource(source=self.source, reprojected_coordinates=self.reprojected_coordinates) - node2 = ReprojectedSource(source=self.source, reprojected_coordinates=self.reprojected_coordinates.definition) - node3 = ReprojectedSource(source=self.source, reprojected_coordinates=self.reprojected_coordinates.json) + node1 = ReprojectedSource(source=Node(), 
reprojected_coordinates=self.reprojected_coordinates) + node2 = ReprojectedSource(source=Node(), reprojected_coordinates=self.reprojected_coordinates.definition) + node3 = ReprojectedSource(source=Node(), reprojected_coordinates=self.reprojected_coordinates.json) assert node1.reprojected_coordinates == self.reprojected_coordinates assert node2.reprojected_coordinates == self.reprojected_coordinates diff --git a/podpac/core/data/test/test_wcs.py b/podpac/core/data/test/test_wcs.py index cf1522723..011cc8874 100644 --- a/podpac/core/data/test/test_wcs.py +++ b/podpac/core/data/test/test_wcs.py @@ -81,11 +81,11 @@ def test_get_capabilities_url(self): """test the capabilities url generation""" node = WCS(source=self.source) - url = node.get_capabilities_url + url = node.capabilities_url assert isinstance(url, string_types) assert node.source in url - def test_get_wcs_coordinates(self): + def test_wcs_coordinates(self): """get wcs coordinates""" import podpac.core.data.ogc @@ -128,44 +128,44 @@ def test_get_wcs_coordinates(self): node = WCS(source=self.source) with pytest.raises(Exception): - node.get_wcs_coordinates() + node.wcs_coordinates # put all dependencies back podpac.core.data.ogc.requests = requests podpac.core.data.ogc.urllib3 = urllib3 podpac.core.data.ogc.lxml = lxml - def test_get_native_coordinates(self): - """get native coordinates""" + def test_coordinates(self): + """get coordinates""" self.mock_requests() node = WCS(source=self.source) # equal to wcs coordinates when no eval coordinates - native_coordinates = node.native_coordinates + coordinates = node.coordinates wcs_coordinates = node.wcs_coordinates - assert native_coordinates == wcs_coordinates + assert coordinates == wcs_coordinates # with eval coordinates # TODO: use real eval coordinates - node._output_coordinates = native_coordinates - native_coordinates = node.native_coordinates + node._output_coordinates = coordinates + coordinates = node.coordinates - assert isinstance(native_coordinates, Coordinates) + assert isinstance(coordinates, Coordinates) # TODO: one returns monotonic, the other returns uniform - assert native_coordinates == node._output_coordinates - assert native_coordinates["lat"] - assert native_coordinates["lon"] - assert native_coordinates["time"] + assert coordinates == node._output_coordinates + assert coordinates["lat"] + assert coordinates["lon"] + assert coordinates["time"] def test_get_data(self): """get data from wcs server""" self.mock_requests() node = WCS(source=self.source) - lat = node.native_coordinates["lat"].coordinates - lon = node.native_coordinates["lon"].coordinates - time = node.native_coordinates["time"].coordinates + lat = node.coordinates["lat"].coordinates + lon = node.coordinates["lon"].coordinates + time = node.coordinates["time"].coordinates # no time notime_coordinates = Coordinates( @@ -176,7 +176,7 @@ def test_get_data(self): with pytest.raises(ValueError): output = node.eval(notime_coordinates) assert isinstance(output, UnitsDataArray) - assert output.native_coordinates["lat"][0] == node.native_coordinates["lat"][0] + assert output.coordinates["lat"][0] == node.coordinates["lat"][0] # time time_coordinates = Coordinates( diff --git a/podpac/core/data/test/test_zarr.py b/podpac/core/data/test/test_zarr.py index 64a6dadfe..672b54cf8 100644 --- a/podpac/core/data/test/test_zarr.py +++ b/podpac/core/data/test/test_zarr.py @@ -6,7 +6,7 @@ from traitlets import TraitError from podpac.core.coordinates import Coordinates -from podpac.core.data.file import Zarr +from 
podpac.core.data.zarr_source import Zarr class TestZarr(object): @@ -24,13 +24,13 @@ def test_dims(self): node = Zarr(source=self.path) assert node.dims == ["lat", "lon"] - def test_available_keys(self): + def test_available_data_keys(self): node = Zarr(source=self.path) - assert node.available_keys == ["a", "b"] + assert node.available_data_keys == ["a", "b"] - def test_native_coordinates(self): + def test_coordinates(self): node = Zarr(source=self.path, data_key="a") - assert node.native_coordinates == Coordinates([[0, 1, 2], [10, 20, 30, 40]], dims=["lat", "lon"]) + assert node.coordinates == Coordinates([[0, 1, 2], [10, 20, 30, 40]], dims=["lat", "lon"]) def test_eval(self): coords = Coordinates([0, 10], dims=["lat", "lon"]) @@ -44,7 +44,7 @@ def test_eval(self): def test_eval_multiple(self): coords = Coordinates([0, 10], dims=["lat", "lon"]) - z = Zarr(source=self.path, output_keys=["a", "b"]) + z = Zarr(source=self.path, data_key=["a", "b"]) out = z.eval(coords) assert out.dims == ("lat", "lon", "output") np.testing.assert_array_equal(out["output"], ["a", "b"]) @@ -52,14 +52,14 @@ def test_eval_multiple(self): assert out.sel(output="b")[0, 0] == 1.0 # single output key - z = Zarr(source=self.path, output_keys=["a"]) + z = Zarr(source=self.path, data_key=["a"]) out = z.eval(coords) assert out.dims == ("lat", "lon", "output") np.testing.assert_array_equal(out["output"], ["a"]) assert out.sel(output="a")[0, 0] == 0.0 # alternate output names - z = Zarr(source=self.path, output_keys=["a", "b"], outputs=["A", "B"]) + z = Zarr(source=self.path, data_key=["a", "b"], outputs=["A", "B"]) out = z.eval(coords) assert out.dims == ("lat", "lon", "output") np.testing.assert_array_equal(out["output"], ["A", "B"]) @@ -79,7 +79,3 @@ def test_s3(self): path = "s3://podpac-internal-test/drought_parameters.zarr" node = Zarr(source=path, data_key="d0") node.close_dataset() - - def test_used_dataset_directly(self): - dataset = zarr.open(self.path, "r") - node = Zarr(dataset=dataset, data_key="a") diff --git a/podpac/core/data/zarr_source.py b/podpac/core/data/zarr_source.py new file mode 100644 index 000000000..5cd79b26a --- /dev/null +++ b/podpac/core/data/zarr_source.py @@ -0,0 +1,175 @@ +import os +import traitlets as tl +import numpy as np + +from lazy_import import lazy_module, lazy_class + +zarr = lazy_module("zarr") +zarrGroup = lazy_class("zarr.Group") + +from podpac.core.authentication import S3Mixin +from podpac.core.utils import common_doc, cached_property +from podpac.core.data.datasource import COMMON_DATA_DOC, DATA_DOC +from podpac.core.data.file_source import BaseFileSource, FileKeysMixin + + +class Zarr(S3Mixin, FileKeysMixin, BaseFileSource): + """Create a DataSource node using zarr. + + Attributes + ---------- + source : str + Path to the Zarr archive + file_mode : str, optional + Default is 'r'. The mode used to open the Zarr archive. Options are r, r+, w, w- or x, a. 
+ dataset : zarr.Group + The h5py file object used to read the file + coordinates : Coordinates + {coordinates} + data_key : str, int + data key, default 'data' + lat_key : str, int + latitude coordinates key, default 'lat' + lon_key : str, int + longitude coordinates key, default 'lon' + time_key : str, int + time coordinates key, default 'time' + alt_key : str, int + altitude coordinates key, default 'alt' + crs : str + Coordinate reference system of the coordinates + cf_time : bool + decode CF datetimes + cf_units : str + units, when decoding CF datetimes + cf_calendar : str + calendar, when decoding CF datetimes + """ + + file_mode = tl.Unicode(default_value="r").tag(readonly=True) + coordinate_index_type = "slice" + + def _get_store(self): + if self.source.startswith("s3://"): + s3fs = lazy_module("s3fs") + root = self.source.strip("s3://") + s3map = s3fs.S3Map(root=root, s3=self.s3, check=False) + store = s3map + else: + store = str(self.source) # has to be a string in Python2.7 for local files + return store + + def chunk_exists(self, index=None, chunk_str=None, data_key=None, list_dir=[]): + """ + Test to see if a chunk exists for a particular slice. + Note: Only the start of the index is used. + + Parameters + ----------- + index: tuple(slice), optional + Default is None. A tuple of slices indicating the data that the users wants to access + chunk_str: str, optional + Default is None. A string equivalent to the filename of the chunk (.e.g. "1.0.5") + data_key: str, optional + Default is None. The data_key for the zarr array that will be queried. + list_dir: list, optional + A list of existing paths -- used in lieu of 'exist' calls + """ + za = self.dataset + if data_key: + za = za[data_key] + else: + data_key = "" + + if index: + chunk_str = ".".join([str(int(s.start // za.chunks[i])) for i, s in enumerate(index)]) + + if not index and not chunk_str: + raise ValueError("Either the index or chunk_str needs to be specified") + + path = os.path.join(self.source, data_key, chunk_str) + if list_dir: + return chunk_str in list_dir + + if self.source.startswith("s3:"): + fs = self.s3 + path = path.replace("\\", "/") + else: + fs = os.path + + return fs.exists(path) + + def list_dir(self, data_key=None): + za = self.dataset + if data_key: + za = za[data_key] + else: + data_key = "" + + path = os.path.join(self.source, data_key) + if self.source.startswith("s3:"): + path = path.replace("\\", "/") + ld = self.s3.ls(path) + else: + ld = os.listdir(path) + + return ld + + @cached_property + def dataset(self): + store = self._get_store() + try: + return zarr.open(store, mode=self.file_mode) + except ValueError: + raise ValueError("No Zarr store found at path '%s'" % self.source) + + # ------------------------------------------------------------------------- + # public api methods + # ------------------------------------------------------------------------- + + @cached_property + def dims(self): + if not isinstance(self.data_key, list): + key = self.data_key + else: + key = self.data_key[0] + try: + return self.dataset[key].attrs["_ARRAY_DIMENSIONS"] + except: + lookup = {self.lat_key: "lat", self.lon_key: "lon", self.alt_key: "alt", self.time_key: "time"} + return [lookup[key] for key in self.dataset if key in lookup] + + def _add_keys(self, base_keys): + keys = base_keys.copy() + for bk in base_keys: + try: + new_keys = [bk + "/" + k for k in self.dataset[bk].keys()] + keys.extend(new_keys) + + # Remove the group key + keys.pop(keys.index(bk)) + except AttributeError: + pass + return keys + + 
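
For reviewers, a minimal local-store sketch of how this new Zarr node is meant to be used; the `example.zarr` path and array names are illustrative only (they mirror the fixture layout in `test_zarr.py`), not part of this diff:

```python
# Illustrative only: build a tiny local zarr archive, then read it back with the new node.
import numpy as np
import zarr

from podpac.core.data.zarr_source import Zarr

# hypothetical local store mirroring the test fixture (lat: 3, lon: 4, data "a": 3x4)
group = zarr.open("example.zarr", mode="w")
group.create_dataset("lat", data=np.array([0.0, 1.0, 2.0]))
group.create_dataset("lon", data=np.array([10.0, 20.0, 30.0, 40.0]))
group.create_dataset("a", data=np.random.rand(3, 4), chunks=(3, 4))

node = Zarr(source="example.zarr", data_key="a")
print(node.keys)   # top-level arrays plus any nested group members
print(node.dims)   # falls back to the lat/lon/alt/time key lookup when _ARRAY_DIMENSIONS is absent
print(node.chunk_exists(chunk_str="0.0", data_key="a"))  # checks for the file example.zarr/a/0.0

output = node.eval(node.coordinates)  # coordinates are read from the "lat" and "lon" arrays
```
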
@cached_property + def keys(self): + keys = list(self.dataset.keys()) + full_keys = self._add_keys(keys) + while keys != full_keys: + keys = full_keys.copy() + full_keys = self._add_keys(keys) + + return full_keys + + @common_doc(COMMON_DATA_DOC) + def get_data(self, coordinates, coordinates_index): + """{get_data} + """ + data = self.create_output_array(coordinates) + if not isinstance(self.data_key, list): + data[:] = self.dataset[self.data_key][coordinates_index] + else: + for key, name in zip(self.data_key, self.outputs): + data.sel(output=name)[:] = self.dataset[key][coordinates_index] + return data diff --git a/podpac/core/interpolation/__init__.py b/podpac/core/interpolation/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/podpac/core/data/interpolation.py b/podpac/core/interpolation/interpolation.py similarity index 97% rename from podpac/core/data/interpolation.py rename to podpac/core/interpolation/interpolation.py index 345d16af6..f4d304286 100644 --- a/podpac/core/data/interpolation.py +++ b/podpac/core/interpolation/interpolation.py @@ -7,8 +7,8 @@ import numpy as np # podpac imports -from podpac.core.data.interpolator import Interpolator -from podpac.core.data.interpolators import NearestNeighbor, NearestPreview, Rasterio, ScipyPoint, ScipyGrid +from podpac.core.interpolation.interpolator import Interpolator +from podpac.core.interpolation.interpolators import NearestNeighbor, NearestPreview, Rasterio, ScipyPoint, ScipyGrid INTERPOLATION_DEFAULT = "nearest" """str : Default interpolation method used when creating a new :class:`Interpolation` class """ @@ -65,6 +65,8 @@ def load_interpolators(): # load interpolators when module is first loaded +# TODO does this really only load once? +# TODO maybe move this whole section? load_interpolators() @@ -76,22 +78,6 @@ class InterpolationException(Exception): pass -def interpolation_trait(default_value=INTERPOLATION_DEFAULT, allow_none=True, **kwargs): - """Create a new interpolation trait - - Returns - ------- - tl.Union - Union trait for an interpolation definition - """ - return tl.Union( - [tl.Dict(), tl.List(), tl.Enum(INTERPOLATION_METHODS), tl.Instance(Interpolation)], - allow_none=allow_none, - default_value=default_value, - **kwargs, - ) - - class Interpolation(object): """Create an interpolation class to handle one interpolation method per unstacked dimension. Used to interpolate data within a datasource. @@ -514,3 +500,15 @@ def interpolate(self, source_coordinates, source_data, eval_coordinates, output_ ) return output_data + + +class InterpolationTrait(tl.Union): + default_value = INTERPOLATION_DEFAULT + + def __init__( + self, + trait_types=[tl.Dict(), tl.List(), tl.Enum(INTERPOLATION_METHODS), tl.Instance(Interpolation)], + *args, + **kwargs + ): + super(InterpolationTrait, self).__init__(trait_types=trait_types, *args, **kwargs) diff --git a/podpac/core/data/interpolator.py b/podpac/core/interpolation/interpolator.py similarity index 98% rename from podpac/core/data/interpolator.py rename to podpac/core/interpolation/interpolator.py index 079213256..7b57b7e71 100644 --- a/podpac/core/data/interpolator.py +++ b/podpac/core/interpolation/interpolator.py @@ -13,6 +13,7 @@ import numpy as np import traitlets as tl +import six # Set up logging _log = logging.getLogger(__name__) @@ -92,7 +93,7 @@ class is constructed. See the :class:`podpac.data.DataSource` `interpolation` at ------- (:class:`podpac.Coordinates`, list) returns the new down selected coordinates and the new associated index. 
These coordinates must exist - in the native coordinates of the source data + in the coordinates of the source data Raises ------ @@ -249,7 +250,7 @@ def _dim_in(self, dim, *coords, **kwargs): unstacked = kwargs.pop("unstacked", False) - if isinstance(dim, str): + if isinstance(dim, six.string_types): dim = [dim] elif not isinstance(dim, (list, tuple)): raise ValueError("`dim` input must be a str, list of str, or tuple of str") @@ -312,7 +313,7 @@ def _loop_helper( tol = self.spatial_tolerance diff = np.abs(source_data.coords[dim].values - i.values) - if tol == None or diff <= tol: + if tol == None or np.any(diff <= tol): src_i = (diff).argmin() src_idx = {dim: source_data.coords[dim][src_i]} else: @@ -329,7 +330,7 @@ def _loop_helper( source_data.loc[src_idx], eval_coordinates, output_data.loc[idx], - **kwargs, + **kwargs ) else: return func(udims, source_coordinates, source_data, eval_coordinates, output_data, **kwargs) diff --git a/podpac/core/data/interpolators.py b/podpac/core/interpolation/interpolators.py similarity index 99% rename from podpac/core/data/interpolators.py rename to podpac/core/interpolation/interpolators.py index 96d8ade11..6e06785ca 100644 --- a/podpac/core/data/interpolators.py +++ b/podpac/core/interpolation/interpolators.py @@ -22,7 +22,7 @@ scipy = None # podac imports -from podpac.core.data.interpolator import COMMON_INTERPOLATOR_DOCS, Interpolator, InterpolatorException +from podpac.core.interpolation.interpolator import COMMON_INTERPOLATOR_DOCS, Interpolator, InterpolatorException from podpac.core.units import UnitsDataArray from podpac.core.coordinates import Coordinates, UniformCoordinates1d, StackedCoordinates from podpac.core.utils import common_doc @@ -196,7 +196,7 @@ def select_coordinates(self, udims, source_coordinates, source_coordinates_index new_coords.append(c) new_coords_idx.append(idx) - return Coordinates(new_coords), tuple(new_coords_idx) + return Coordinates(new_coords, validate_crs=False), tuple(new_coords_idx) @common_doc(COMMON_INTERPOLATOR_DOCS) diff --git a/podpac/core/data/test/test_interpolate.py b/podpac/core/interpolation/test/test_interpolate.py similarity index 89% rename from podpac/core/data/test/test_interpolate.py rename to podpac/core/interpolation/test/test_interpolate.py index ad0ccaa69..1faa8e04c 100644 --- a/podpac/core/data/test/test_interpolate.py +++ b/podpac/core/interpolation/test/test_interpolate.py @@ -12,11 +12,12 @@ import traitlets as tl import numpy as np +from podpac.core.utils import ArrayTrait from podpac.core.units import UnitsDataArray from podpac.core.coordinates import Coordinates, clinspace -from podpac.core.data.file import rasterio -from podpac.core.data import datasource -from podpac.core.data.interpolation import ( +from podpac.core.data.rasterio_source import rasterio +from podpac.core.data.datasource import DataSource +from podpac.core.interpolation.interpolation import ( Interpolation, InterpolationException, INTERPOLATORS, @@ -26,11 +27,16 @@ INTERPOLATION_METHODS_DICT, ) -from podpac.core.data.interpolator import Interpolator, InterpolatorException -from podpac.core.data.interpolators import NearestNeighbor, NearestPreview, Rasterio, ScipyGrid, ScipyPoint +from podpac.core.interpolation.interpolator import Interpolator, InterpolatorException +from podpac.core.interpolation.interpolators import NearestNeighbor, NearestPreview, Rasterio, ScipyGrid, ScipyPoint -# test fixtures -from podpac.core.data.test.test_datasource import MockArrayDataSource + +class MockArrayDataSource(DataSource): + data = 
ArrayTrait().tag(attr=True) + coordinates = tl.Instance(Coordinates).tag(attr=True) + + def get_data(self, coordinates, coordinates_index): + return self.create_output_array(coordinates, data=self.data[coordinates_index]) class TestInterpolation(object): @@ -437,7 +443,7 @@ def test_interpolation(self): # unstacked 1D source = np.random.rand(5) coords_src = Coordinates([np.linspace(0, 10, 5)], dims=["lat"]) - node = MockArrayDataSource(source=source, native_coordinates=coords_src, interpolation=interpolation) + node = MockArrayDataSource(data=source, coordinates=coords_src, interpolation=interpolation) coords_dst = Coordinates([[1, 1.2, 1.5, 5, 9]], dims=["lat"]) output = node.eval(coords_dst) @@ -451,7 +457,7 @@ def test_interpolation(self): coords_src = Coordinates([clinspace(0, 10, 5), clinspace(0, 10, 5)], dims=["lat", "lon"]) coords_dst = Coordinates([clinspace(2, 12, 5), clinspace(2, 12, 5)], dims=["lat", "lon"]) - node = MockArrayDataSource(source=source, native_coordinates=coords_src, interpolation=interpolation) + node = MockArrayDataSource(data=source, coordinates=coords_src, interpolation=interpolation) output = node.eval(coords_dst) assert isinstance(output, UnitsDataArray) @@ -462,8 +468,11 @@ def test_interpolation(self): # TODO: implement stacked handling source = np.random.rand(5) coords_src = Coordinates([(np.linspace(0, 10, 5), np.linspace(0, 10, 5))], dims=["lat_lon"]) - node = MockArrayDataSource(source=source, native_coordinates=coords_src) - node.interpolation = {"method": "nearest", "interpolators": [NearestNeighbor]} + node = MockArrayDataSource( + data=source, + coordinates=coords_src, + interpolation={"method": "nearest", "interpolators": [NearestNeighbor]}, + ) coords_dst = Coordinates([(np.linspace(1, 9, 3), np.linspace(1, 9, 3))], dims=["lat_lon"]) with pytest.raises(InterpolationException): @@ -473,8 +482,11 @@ def test_interpolation(self): # source = stacked, dest = unstacked source = np.random.rand(5) coords_src = Coordinates([(np.linspace(0, 10, 5), np.linspace(0, 10, 5))], dims=["lat_lon"]) - node = MockArrayDataSource(source=source, native_coordinates=coords_src) - node.interpolation = {"method": "nearest", "interpolators": [NearestNeighbor]} + node = MockArrayDataSource( + data=source, + coordinates=coords_src, + interpolation={"method": "nearest", "interpolators": [NearestNeighbor]}, + ) coords_dst = Coordinates([np.linspace(1, 9, 3), np.linspace(1, 9, 3)], dims=["lat", "lon"]) with pytest.raises(InterpolationException): @@ -484,8 +496,11 @@ def test_interpolation(self): # source = unstacked, dest = stacked source = np.random.rand(5, 5) coords_src = Coordinates([np.linspace(0, 10, 5), np.linspace(0, 10, 5)], dims=["lat", "lon"]) - node = MockArrayDataSource(source=source, native_coordinates=coords_src) - node.interpolation = {"method": "nearest", "interpolators": [NearestNeighbor]} + node = MockArrayDataSource( + data=source, + coordinates=coords_src, + interpolation={"method": "nearest", "interpolators": [NearestNeighbor]}, + ) coords_dst = Coordinates([(np.linspace(1, 9, 3), np.linspace(1, 9, 3))], dims=["lat_lon"]) with pytest.raises(InterpolationException): @@ -497,8 +512,8 @@ def test_spatial_tolerance(self): source = np.random.rand(5) coords_src = Coordinates([np.linspace(0, 10, 5)], dims=["lat"]) node = MockArrayDataSource( - source=source, - native_coordinates=coords_src, + data=source, + coordinates=coords_src, interpolation={"method": "nearest", "params": {"spatial_tolerance": 1.1}}, ) @@ -519,8 +534,8 @@ def test_time_tolerance(self): 
[np.linspace(0, 10, 5), clinspace("2018-01-01", "2018-01-09", 5)], dims=["lat", "time"] ) node = MockArrayDataSource( - source=source, - native_coordinates=coords_src, + data=source, + coordinates=coords_src, interpolation={ "method": "nearest", "params": {"spatial_tolerance": 1.1, "time_tolerance": np.timedelta64(1, "D")}, @@ -558,8 +573,9 @@ def test_interpolate_rasterio(self): coords_dst = Coordinates([clinspace(1, 11, 3), clinspace(1, 11, 5)], dims=["lat", "lon"]) # try one specific rasterio case to measure output - node = MockArrayDataSource(source=source, native_coordinates=coords_src) - node.interpolation = {"method": "min", "interpolators": [Rasterio]} + node = MockArrayDataSource( + data=source, coordinates=coords_src, interpolation={"method": "min", "interpolators": [Rasterio]} + ) output = node.eval(coords_dst) assert isinstance(output, UnitsDataArray) @@ -567,14 +583,18 @@ def test_interpolate_rasterio(self): assert output.data[0, 3] == 3.0 assert output.data[0, 4] == 4.0 - node.interpolation = {"method": "max", "interpolators": [Rasterio]} + node = MockArrayDataSource( + data=source, coordinates=coords_src, interpolation={"method": "max", "interpolators": [Rasterio]} + ) output = node.eval(coords_dst) assert isinstance(output, UnitsDataArray) assert np.all(output.lat.values == coords_dst.coords["lat"]) assert output.data[0, 3] == 9.0 assert output.data[0, 4] == 9.0 - node.interpolation = {"method": "bilinear", "interpolators": [Rasterio]} + node = MockArrayDataSource( + data=source, coordinates=coords_src, interpolation={"method": "bilinear", "interpolators": [Rasterio]} + ) output = node.eval(coords_dst) assert isinstance(output, UnitsDataArray) assert np.all(output.lat.values == coords_dst.coords["lat"]) @@ -589,9 +609,7 @@ def test_interpolate_rasterio_descending(self): coords_dst = Coordinates([clinspace(2, 12, 5), clinspace(2, 12, 5)], dims=["lat", "lon"]) node = MockArrayDataSource( - source=source, - native_coordinates=coords_src, - interpolation={"method": "nearest", "interpolators": [Rasterio]}, + data=source, coordinates=coords_src, interpolation={"method": "nearest", "interpolators": [Rasterio]} ) output = node.eval(coords_dst) @@ -611,8 +629,9 @@ def test_interpolate_scipy_grid(self): coords_dst = Coordinates([clinspace(1, 11, 5), clinspace(1, 11, 5)], dims=["lat", "lon"]) # try one specific rasterio case to measure output - node = MockArrayDataSource(source=source, native_coordinates=coords_src) - node.interpolation = {"method": "nearest", "interpolators": [ScipyGrid]} + node = MockArrayDataSource( + data=source, coordinates=coords_src, interpolation={"method": "nearest", "interpolators": [ScipyGrid]} + ) output = node.eval(coords_dst) assert isinstance(output, UnitsDataArray) @@ -623,14 +642,20 @@ def test_interpolate_scipy_grid(self): assert output.data[1, 3] == 8.0 assert np.isnan(output.data[0, 4]) # TODO: how to handle outside bounds - node.interpolation = {"method": "cubic_spline", "interpolators": [ScipyGrid]} + node = MockArrayDataSource( + data=source, + coordinates=coords_src, + interpolation={"method": "cubic_spline", "interpolators": [ScipyGrid]}, + ) output = node.eval(coords_dst) assert isinstance(output, UnitsDataArray) assert np.all(output.lat.values == coords_dst.coords["lat"]) assert int(output.data[0, 0]) == 2 assert int(output.data[2, 4]) == 16 - node.interpolation = {"method": "bilinear", "interpolators": [ScipyGrid]} + node = MockArrayDataSource( + data=source, coordinates=coords_src, interpolation={"method": "bilinear", "interpolators": 
[ScipyGrid]} + ) output = node.eval(coords_dst) assert isinstance(output, UnitsDataArray) assert np.all(output.lat.values == coords_dst.coords["lat"]) @@ -647,9 +672,7 @@ def test_interpolate_irregular_arbitrary_2dims(self): coords_dst = Coordinates([clinspace(1, 11, 5), clinspace(1, 11, 5), [2, 3, 5]], dims=["lat", "lon", "time"]) node = MockArrayDataSource( - source=source, - native_coordinates=coords_src, - interpolation={"method": "nearest", "interpolators": [ScipyGrid]}, + data=source, coordinates=coords_src, interpolation={"method": "nearest", "interpolators": [ScipyGrid]} ) output = node.eval(coords_dst) @@ -668,9 +691,7 @@ def test_interpolate_irregular_arbitrary_descending(self): coords_dst = Coordinates([clinspace(2, 12, 5), clinspace(2, 12, 5)], dims=["lat", "lon"]) node = MockArrayDataSource( - source=source, - native_coordinates=coords_src, - interpolation={"method": "nearest", "interpolators": [ScipyGrid]}, + data=source, coordinates=coords_src, interpolation={"method": "nearest", "interpolators": [ScipyGrid]} ) output = node.eval(coords_dst) @@ -686,9 +707,7 @@ def test_interpolate_irregular_arbitrary_swap(self): coords_dst = Coordinates([clinspace(2, 12, 5), clinspace(2, 12, 5)], dims=["lat", "lon"]) node = MockArrayDataSource( - source=source, - native_coordinates=coords_src, - interpolation={"method": "nearest", "interpolators": [ScipyGrid]}, + data=source, coordinates=coords_src, interpolation={"method": "nearest", "interpolators": [ScipyGrid]} ) output = node.eval(coords_dst) @@ -704,9 +723,7 @@ def test_interpolate_irregular_lat_lon(self): coords_dst = Coordinates([[[0, 2, 4, 6, 8, 10], [0, 2, 4, 5, 6, 10]]], dims=["lat_lon"]) node = MockArrayDataSource( - source=source, - native_coordinates=coords_src, - interpolation={"method": "nearest", "interpolators": [ScipyGrid]}, + data=source, coordinates=coords_src, interpolation={"method": "nearest", "interpolators": [ScipyGrid]} ) output = node.eval(coords_dst) @@ -724,9 +741,7 @@ def test_interpolate_scipy_point(self): coords_src = Coordinates([[[0, 2, 4, 6, 8, 10], [0, 2, 4, 5, 6, 10]]], dims=["lat_lon"]) coords_dst = Coordinates([[[1, 2, 3, 4, 5], [1, 2, 3, 4, 5]]], dims=["lat_lon"]) node = MockArrayDataSource( - source=source, - native_coordinates=coords_src, - interpolation={"method": "nearest", "interpolators": [ScipyPoint]}, + data=source, coordinates=coords_src, interpolation={"method": "nearest", "interpolators": [ScipyPoint]} ) output = node.eval(coords_dst) diff --git a/podpac/core/managers/aws.py b/podpac/core/managers/aws.py index 432e8e9c8..bd2ee3f76 100644 --- a/podpac/core/managers/aws.py +++ b/podpac/core/managers/aws.py @@ -11,6 +11,8 @@ import base64 from datetime import datetime +from six import string_types + import boto3 import botocore import traitlets as tl @@ -36,7 +38,7 @@ class LambdaException(Exception): class Lambda(Node): """A `Node` wrapper to evaluate source on AWS Lambda function - + Attributes ---------- @@ -58,10 +60,12 @@ class Lambda(Node): S3 bucket name to use with lambda function. Defaults to :str:`podpac.settings["S3_BUCKET_NAME"]` or "podpac-autogen-" with the timestamp to ensure uniqueness. eval_settings : dict, optional Default is podpac.settings. PODPAC settings that will be used to evaluate the Lambda function. + eval_timeout : float, optional + Default is None. The amount of time to wait for an eval to return. To get near asynchronous response, set this to a small number. 
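
As a point of reference for the new `eval_timeout` / `download_result` behavior, a hedged sketch of a fire-and-forget invocation; it assumes a PODPAC Lambda function has already been deployed and AWS credentials are configured, and the source node and coordinates below are placeholders:

```python
# Illustrative only: near-asynchronous Lambda evaluation using the traits added in this diff.
from podpac.core.algorithm.utility import Arange
from podpac.core.coordinates import Coordinates, clinspace
from podpac.managers import Lambda

coords = Coordinates([clinspace(40, 41, 10), clinspace(-105, -104, 10)], dims=["lat", "lon"])

node = Lambda(
    source=Arange(),                      # placeholder source node
    eval_timeout=1.25,                    # botocore read timeout for the invoke (small value -> near-asynchronous)
    download_result=False,                # InvocationType="Event": do not wait for the result payload
    output_format={"format": "netcdf"},   # optional: how the function should save its output
)
node.eval(coords)  # returns almost immediately; results are written by the function itself
```
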
Other Attributes ---------------- - attrs : dict + node_attrs : dict Additional attributes passed on to the Lambda definition of the base node download_result : Bool Flag that indicated whether node should wait to download the data. @@ -138,6 +142,8 @@ class Lambda(Node): Defaults to "USD". See https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/budgets.html#Budgets.Client.create_budget for currency (or Unit) options. + output_format : dict, optional + Definition for how output is saved after results are computed. session : :class:`podpac.managers.aws.Session` AWS Session to use for this node. source : :class:`podpac.Node` @@ -164,7 +170,7 @@ def _session_default(self): ) # general function parameters - function_eval_trigger = tl.Enum(["eval", "S3", "APIGateway"], default_value="eval").tag(attr=True, readonly=True) + function_eval_trigger = tl.Enum(["eval", "S3", "APIGateway"], default_value="eval").tag(attr=True) # lambda function parameters function_name = tl.Unicode().tag(attr=True, readonly=True) # see default below @@ -184,7 +190,7 @@ def _session_default(self): function_source_bucket = tl.Unicode(default_value="podpac-dist", allow_none=True).tag(readonly=True) function_source_dist_key = tl.Unicode().tag(readonly=True) # see default below function_source_dependencies_key = tl.Unicode().tag(readonly=True) # see default below - function_allow_unsafe_eval = tl.Bool(default_value=False).tag(readonly=True) + function_allow_unsafe_eval = tl.Bool().tag(readonly=True) # see default below function_restrict_pipelines = tl.List(tl.Unicode(), default_value=[]).tag(readonly=True) _function_arn = tl.Unicode(default_value=None, allow_none=True) _function_last_modified = tl.Unicode(default_value=None, allow_none=True) @@ -194,6 +200,12 @@ def _session_default(self): _function_valid = tl.Bool(default_value=False, allow_none=True) _function = tl.Dict(default_value=None, allow_none=True) # raw response from AWS on "get_" + output_format = tl.Dict(None, allow_none=True).tag(attr=True) + + @property + def outputs(self): + return self.source.outputs + @tl.default("function_name") def _function_name_default(self): if settings["FUNCTION_NAME"] is None: @@ -228,6 +240,12 @@ def _function_source_dependencies_key_default(self): def _function_tags_default(self): return settings["AWS_TAGS"] or {} + @tl.default("function_allow_unsafe_eval") + def _function_allow_unsafe_eval_default(self): + return "UNSAFE_EVAL_HASH" in self.eval_settings and isinstance( + self.eval_settings["UNSAFE_EVAL_HASH"], string_types + ) + # role parameters function_role_name = tl.Unicode().tag(readonly=True) # see default below function_role_description = tl.Unicode(default_value="PODPAC Lambda Role").tag(readonly=True) @@ -378,10 +396,11 @@ def _function_budget_name_default(self): source = tl.Instance(Node, allow_none=True).tag(attr=True) source_output_format = tl.Unicode(default_value="netcdf") source_output_name = tl.Unicode() - attrs = tl.Dict() + node_attrs = tl.Dict() download_result = tl.Bool(True).tag(attr=True) force_compute = tl.Bool().tag(attr=True) eval_settings = tl.Dict().tag(attr=True) + eval_timeout = tl.Float(610).tag(attr=True) @tl.default("source_output_name") def _source_output_name_default(self): @@ -405,9 +424,9 @@ def pipeline(self): """ d = OrderedDict() d["pipeline"] = self.source.definition - if self.attrs: + if self.node_attrs: out_node = next(reversed(d["pipeline"].keys())) - d["pipeline"][out_node]["attrs"].update(self.attrs) + d["pipeline"][out_node]["attrs"].update(self.node_attrs) 
d["output"] = {"format": self.source_output_format} d["settings"] = self.eval_settings return d @@ -1393,26 +1412,44 @@ def _create_eval_pipeline(self, coordinates): pipeline["settings"][ "FUNCTION_DEPENDENCIES_KEY" ] = self.function_s3_dependencies_key # overwrite in case this is specified explicitly by class + if self.output_format: + pipeline["output"] = self.output_format return pipeline def _eval_invoke(self, coordinates, output=None): """eval node through invoke trigger""" - _log.debug("Evaluating pipeline via invoke") # create eval pipeline pipeline = self._create_eval_pipeline(coordinates) # create lambda client - awslambda = self.session.client("lambda") + config = botocore.config.Config( + read_timeout=self.eval_timeout, max_pool_connections=1001, retries={"max_attempts": 0} + ) + awslambda = self.session.client("lambda", config=config) - # invoke + # pipeline payload payload = bytes(json.dumps(pipeline, indent=4, cls=JSONEncoder).encode("UTF-8")) - response = awslambda.invoke( - FunctionName=self.function_name, - LogType="Tail", # include the execution log in the response. - Payload=payload, - ) + + if self.download_result: + _log.debug("Evaluating pipeline via invoke synchronously") + response = awslambda.invoke( + FunctionName=self.function_name, + LogType="Tail", # include the execution log in the response. + Payload=payload, + ) + else: + # async invocation + _log.debug("Evaluating pipeline via invoke asynchronously") + awslambda.invoke( + FunctionName=self.function_name, + InvocationType="Event", + LogType="Tail", # include the execution log in the response. + Payload=payload, + ) + + return _log.debug("Received response from lambda function") @@ -1428,7 +1465,11 @@ def _eval_invoke(self, coordinates, output=None): # After waiting, load the pickle file like this: payload = response["Payload"].read() - self._output = UnitsDataArray.open(payload) + try: + self._output = UnitsDataArray.open(payload) + except ValueError: + # Not actually a data-array, returning a string instead + return payload.decode("utf-8") return self._output def _eval_s3(self, coordinates, output=None): @@ -1708,7 +1749,7 @@ def put_object(session, bucket_name, bucket_path, file=None, object_acl="private object_config = {"ACL": object_acl, "Bucket": bucket_name, "Key": bucket_path} object_body = None - if isinstance(file, str): + if isinstance(file, string_types): with open(file, "rb") as f: object_body = f.read() else: diff --git a/podpac/core/managers/aws_lambda.py b/podpac/core/managers/aws_lambda.py deleted file mode 100644 index 72f0a2b44..000000000 --- a/podpac/core/managers/aws_lambda.py +++ /dev/null @@ -1,12 +0,0 @@ -""" -:deprecated: See `aws` module -""" - -import warnings - -warnings.warn( - "The `aws_lambda` module is deprecated and will be removed in podpac 2.0. 
See the `aws` module " - "for AWS management utilites", - DeprecationWarning, -) -from podpac.core.managers.aws import * diff --git a/podpac/core/managers/multi_process.py b/podpac/core/managers/multi_process.py new file mode 100644 index 000000000..4cbbf01b0 --- /dev/null +++ b/podpac/core/managers/multi_process.py @@ -0,0 +1,74 @@ +from __future__ import division, unicode_literals, print_function, absolute_import + +import sys + +from multiprocessing import Process as mpProcess +from multiprocessing import Queue +import traitlets as tl +import logging + +from podpac.core.node import Node +from podpac.core.utils import NodeTrait +from podpac.core.coordinates import Coordinates +from podpac.core.settings import settings + +# Set up logging +_log = logging.getLogger(__name__) + + +def _f(definition, coords, q, outputkw): + try: + n = Node.from_json(definition) + c = Coordinates.from_json(coords) + o = n.eval(c) + o.serialize() + _log.debug("o.shape: {}, output_format: {}".format(o.shape, outputkw)) + if outputkw: + _log.debug("Saving output results to output format {}".format(outputkw)) + o = o.to_format(outputkw["format"], **outputkw.get("format_kwargs")) + q.put(o) + except Exception as e: + q.put(str(e)) + + +class Process(Node): + """ + Source node will be evaluated in another process, and it is blocking! + """ + + source = NodeTrait().tag(attr=True) + output_format = tl.Dict(None, allow_none=True).tag(attr=True) + timeout = tl.Int(None, allow_none=True) + block = tl.Bool(True) + + @property + def outputs(self): + return self.source.outputs + + def eval(self, coordinates, output=None): + definition = self.source.json + coords = coordinates.json + + q = Queue() + process = mpProcess(target=_f, args=(definition, coords, q, self.output_format)) + process.daemon = True + _log.debug("Starting process.") + process.start() + _log.debug("Retrieving data from queue.") + o = q.get(timeout=self.timeout, block=self.block) + _log.debug("Joining.") + process.join() # This is blocking! + _log.debug("Closing.") + if (sys.version_info.major + sys.version_info.minor / 10.0) >= 3.7: + process.close() # New in version Python 3.7 + if isinstance(o, str): + raise Exception(o) + if o is None: + return + o.deserialize() + if output is not None: + output[:] = o.data[:] + else: + output = o + + return output diff --git a/podpac/core/managers/parallel.py b/podpac/core/managers/parallel.py new file mode 100644 index 000000000..d50d74681 --- /dev/null +++ b/podpac/core/managers/parallel.py @@ -0,0 +1,395 @@ +""" +Module to help farm out computation to multiple workers and save the results in a zarr file. +""" + +from __future__ import division, unicode_literals, print_function, absolute_import + +import time +import logging +import traitlets as tl +import numpy as np + +from multiprocessing.pool import ThreadPool + +from podpac.core.managers.multi_threading import Lock +from podpac.core.node import Node +from podpac.core.utils import NodeTrait +from podpac.core.data.zarr_source import Zarr +from podpac.core.coordinates import Coordinates, merge_dims + +# Optional dependencies +from lazy_import import lazy_module, lazy_class + +zarr = lazy_module("zarr") +zarrGroup = lazy_class("zarr.Group") +botocore = lazy_module("botocore") + +# Set up logging +_log = logging.getLogger(__name__) + + +class Parallel(Node): + """ + This class launches the parallel node evaluations in separate threads. As such, the node does not need to return + immediately (i.e. does NOT have to be asynchronous). For asynchronous nodes + (i.e. 
aws.Lambda with download_result=False) use ParrallelAsync + + Attributes + ----------- + chunks: dict + Dictionary of dimensions and sizes that will be iterated over. If a dimension is not in this dictionary, the + size of the eval coordinates will be used for the chunk. In this case, it may not be possible to automatically + set the coordinates of missing dimensions in the final file. + fill_output: bool + Default is True. When True, the final results will be assembled and returned to the user. If False, the final + results should be written to a file by specifying the output_format in a Process or Lambda node. + See note below. + source: podpac.Node + The source dataset for the computation + number_of_workers: int + Default is 1. Number of parallel process workers at one time. + start_i: int, optional + Default is 0. Starting chunk. This allow you to restart a run without having to check/submit 1000's of workers + before getting back to where you were. Empty chunks make the submission slower. + + Notes + ------ + In some cases where the input and output coordinates of the source node is not the same (such as reduce nodes) + and fill_output is True, the user may need to specify 'output' as part of the eval call. + """ + + _repr_keys = ["source", "number_of_workers", "chunks"] + source = NodeTrait().tag(attr=True) + chunks = tl.Dict().tag(attr=True) + fill_output = tl.Bool(True).tag(attr=True) + number_of_workers = tl.Int(1).tag(attr=True) + _lock = Lock() + errors = tl.List() + start_i = tl.Int(0) + + def eval(self, coordinates, output=None): + # Make a thread pool to manage queue + pool = ThreadPool(processes=self.number_of_workers) + + if output is None and self.fill_output: + output = self.create_output_array(coordinates) + + shape = [] + for d in coordinates.dims: + if d in self.chunks: + shape.append(self.chunks[d]) + else: + shape.append(coordinates[d].size) + + results = [] + # inputs = [] + i = 0 + for coords, slc in coordinates.iterchunks(shape, True): + # inputs.append(coords) + if i < self.start_i: + _log.debug("Skipping {} since it is less than self.start_i ({})".format(i, self.start_i)) + i += 1 + continue + + out = None + if self.fill_output and output is not None: + out = output[slc] + with self._lock: + _log.debug("Added {} to worker pool".format(i)) + _log.debug("Node eval with coords: {}, {}".format(slc, coords)) + results.append(pool.apply_async(self.eval_source, [coords, slc, out, i])) + i += 1 + + _log.info("Added all chunks to worker pool. 
Now waiting for results.") + start_time = time.time() + for i, res in enumerate(results): + # _log.debug('Waiting for results: {} {}'.format(i, inputs[i])) + dt = str(np.timedelta64(int(1000 * (time.time() - start_time)), "ms").astype(object)) + _log.info("({}): Waiting for results: {} / {}".format(dt, i + 1, len(results))) + + # Try to get the results / wait for the results + try: + o, slc = res.get() + except Exception as e: + o = None + slc = None + self.errors.append((i, res, e)) + dt = str(np.timedelta64(int(1000 * (time.time() - start_time)), "ms").astype(object)) + _log.warning("({}) {} failed with exception {}".format(dt, i, e)) + + dt = str(np.timedelta64(int(1000 * (time.time() - start_time)), "ms").astype(object)) + _log.info("({}) Finished result: {} / {}".format(time.time() - start_time, i + 1, len(results))) + + # Fill output + if self.fill_output: + if output is None: + missing_dims = [d for d in coordinates.dims if d not in self.chunks.keys()] + coords = coordinates.drop(missing_dims) + missing_coords = Coordinates.from_xarray(o).drop(list(self.chunks.keys())) + coords = merge_dims([coords, missing_coords]) + coords = coords.transpose(*coordinates.dims) + output = self.create_output_array(coords) + output[slc] = o + + _log.info("Completed parallel execution.") + pool.close() + + return output + + def eval_source(self, coordinates, coordinates_index, out, i, source=None): + if source is None: + source = self.source + # Make a copy to prevent any possibility of memory corruption + source = Node.from_definition(source.definition) + + _log.info("Submitting source {}".format(i)) + return (source.eval(coordinates, output=out), coordinates_index) + + +class ParallelAsync(Parallel): + """ + This class launches the parallel node evaluations in threads up to n_workers, and expects the node.eval to return + quickly for parallel execution. This Node was written with aws.Lambda(eval_timeout=1.25) Nodes in mind. + + Users can implement the `check_worker_available` method or specify the `no_worker_exception` attribute, which is an + exception thrown if workers are not available. + + Attributes + ----------- + chunks: dict + Dictionary of dimensions and sizes that will be iterated over. If a dimension is not in this dictionary, the + size of the eval coordinates will be used for the chunk. In this case, it may not be possible to automatically + set the coordinates of missing dimensions in the final file. + fill_output: bool + Default is True. When True, the final results will be assembled and returned to the user. If False, the final + results should be written to a file by specifying the output_format in a Process or Lambda node. + See note below. + source: podpac.Node + The source dataset for the computation + sleep_time: float + Default is 1 second. Number of seconds to sleep between trying to submit new workers + no_worker_exception: Exception, optional + Default is .Exception class used to identify when a submission failed due to no available workers. The default + is chosen to work with the podpac.managers.Lambda node. + async_exception: Exception + Default is botocore.exceptions.ReadTimeoutException. This is an exception thrown by the async function in case + it time out waiting for a return. In our case, this is a success. The default is chosen to work with the + podpac.managers.Lambda node. 
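
To make the intended pairing with `aws.Lambda(eval_timeout=1.25)` concrete, here is a sketch of chunked submission through this node; the function deployment, credentials, and chunk sizes are assumptions:

```python
# Illustrative only: submit a large evaluation in chunks through asynchronous Lambda workers.
from podpac.core.algorithm.utility import Arange
from podpac.core.coordinates import Coordinates, clinspace
from podpac.core.managers.parallel import ParallelAsync
from podpac.managers import Lambda

coords = Coordinates([clinspace(0, 1, 400), clinspace(0, 1, 400)], dims=["lat", "lon"])

worker = Lambda(source=Arange(), eval_timeout=1.25, download_result=False)  # fire-and-forget worker

node = ParallelAsync(
    source=worker,
    chunks={"lat": 100, "lon": 100},  # 4 x 4 = 16 chunk evaluations
    fill_output=False,                # workers persist their own results; nothing is assembled here
    sleep_time=1.0,                   # seconds to wait before resubmitting when no worker is free
)
node.eval(coords)
```

In practice this can be combined with the `ZarrOutputMixin` defined below (`ParallelAsyncOutputZarr`), so that each worker writes its chunk into a shared zarr store via the `output_format` attribute.
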
+ Notes + ------ + In some cases where the input and output coordinates of the source node is not the same (such as reduce nodes) + and fill_output is True, the user may need to specify 'output' as part of the eval call. + """ + + source = NodeTrait().tag(attr=True) + chunks = tl.Dict().tag(attr=True) + fill_output = tl.Bool(True).tag(attr=True) + sleep_time = tl.Float(1).tag(attr=True) + no_worker_exception = tl.Type(botocore.exceptions.ClientError).tag(attr=True) + async_exception = tl.Type(botocore.exceptions.ReadTimeoutError).tag(attr=True) + + def check_worker_available(self): + return True + + def eval_source(self, coordinates, coordinates_index, out, i, source=None): + if source is None: + source = self.source + # Make a copy to prevent any possibility of memory corruption + source = Node.from_definition(source.definition) + + success = False + o = None + while not success: + if self.check_worker_available(): + try: + o = source.eval(coordinates, out) + success = True + except self.async_exception: + # This exception is fine and constitutes a success + o = None + success = True + except self.no_worker_exception as e: + response = e.response + if not (response and response.get("Error", {}).get("Code") == "TooManyRequestsException"): + raise e # Raise error again, not the right error + _log.debug("Worker {} exception {}".format(i, e)) + success = False + time.sleep(self.sleep_time) + else: + _log.debug("Worker unavailable for {}".format(i, e)) + time.sleep(self.sleep_time) + _log.info("Submitting source {}".format(i)) + return (o, coordinates_index) + + +class ZarrOutputMixin(tl.HasTraits): + """ + This class assumes that the node has a 'output_format' attribute + (currently the "Lambda" Node, and the "Process" Node) + + Attributes + ----------- + zarr_file: str + Path to the output zarr file that collects all of the computed results. This can reside on S3. + dataset: ZarrGroup + A handle to the zarr group pointing to the output file + fill_output: bool, optional + Default is False (unlike parent class). If True, will collect the output data and return it as an xarray. + init_file_mode: str, optional + Default is 'w'. Mode used for initializing the zarr file. + zarr_chunks: dict + Size of the chunks in the zarr file for each dimension + zarr_shape: dict, optional + Default is the {coordinated.dims: coordinates.shape}, where coordinates used as part of the eval call. This + does not need to be specified unless the Node modifies the input coordinates (as part of a Reduce operation, + for example). The result can be incorrect and requires care/checking by the user. + zarr_coordinates: podpac.Coordinates, optional + Default is None. If the node modifies the shape of the input coordinates, this allows users to set the + coordinates in the output zarr file. This can be incorrect and requires care by the user. + skip_existing: bool + Default is False. If true, this will check to see if the results already exist. And if so, it will not + submit a job for that particular coordinate evaluation. This assumes self.chunks == self.zar_chunks + list_dir: bool, optional + Default is False. If skip_existing is True, by default existing files are checked by asking for an 'exists' call. + If list_dir is True, then at the first opportunity a "list_dir" is performed on the directory and the results + are cached. 
+ """ + + zarr_file = tl.Unicode().tag(attr=True) + dataset = tl.Any() + zarr_node = NodeTrait() + zarr_data_key = tl.Union([tl.Unicode(), tl.List()]) + fill_output = tl.Bool(False) + init_file_mode = tl.Unicode("a").tag(attr=True) + zarr_chunks = tl.Dict(default_value=None, allow_none=True).tag(attr=True) + zarr_shape = tl.Dict(allow_none=True, default_value=None).tag(attr=True) + zarr_coordinates = tl.Instance(Coordinates, allow_none=True, default_value=None).tag(attr=True) + skip_existing = tl.Bool(True).tag(attr=True) + list_dir = tl.Bool(False) + _list_dir = tl.List(allow_none=True, default_value=[]) + _shape = tl.Tuple() + aws_client_kwargs = tl.Dict() + aws_config_kwargs = tl.Dict() + + def eval(self, coordinates, output=None): + if self.zarr_shape is None: + self._shape = coordinates.shape + else: + self._shape = tuple(self.zarr_shape.values()) + + # initialize zarr file + if self.zarr_chunks is None: + chunks = [self.chunks[d] for d in coordinates] + else: + chunks = [self.zarr_chunks[d] for d in coordinates] + zf, data_key, zn = self.initialize_zarr_array(self._shape, chunks) + self.dataset = zf + self.zarr_data_key = data_key + self.zarr_node = zn + zn.keys + + # eval + _log.debug("Starting parallel eval.") + missing_dims = [d for d in coordinates.dims if d not in self.chunks.keys()] + if self.zarr_coordinates is not None: + missing_dims = missing_dims + [d for d in self.zarr_coordinates.dims if d not in missing_dims] + set_coords = merge_dims([coordinates.drop(missing_dims), self.zarr_coordinates]) + else: + set_coords = coordinates.drop(missing_dims) + set_coords.transpose(*coordinates.dims) + + self.set_zarr_coordinates(set_coords, data_key) + if self.list_dir: + dk = data_key + if isinstance(dk, list): + dk = dk[0] + self._list_dir = self.zarr_node.list_dir(dk) + + output = super(ZarrOutputMixin, self).eval(coordinates, output) + + # fill in the coordinates, this is guaranteed to be correct even if the user messed up. 
+ if output is not None: + self.set_zarr_coordinates(Coordinates.from_xarray(output.coords), data_key) + else: + return zf + + return output + + def set_zarr_coordinates(self, coordinates, data_key): + # Fill in metadata + for dk in data_key: + self.dataset[dk].attrs["_ARRAY_DIMENSIONS"] = coordinates.dims + for d in coordinates.dims: + # TODO ADD UNITS AND TIME DECODING INFORMATION + self.dataset.create_dataset(d, shape=coordinates[d].size, overwrite=True) + self.dataset[d][:] = coordinates[d].coordinates + + def initialize_zarr_array(self, shape, chunks): + _log.debug("Creating Zarr file.") + zn = Zarr(source=self.zarr_file, file_mode=self.init_file_mode, aws_client_kwargs=self.aws_client_kwargs) + if self.source.output or getattr(self.source, "data_key", None): + data_key = self.source.output + if data_key is None: + data_key = self.source.data_key + data_key = [data_key] + elif self.source.outputs: + data_key = self.source.outputs + else: + data_key = ["data"] + + zf = zarr.open(zn._get_store(), mode=self.init_file_mode) + + # Intialize the output zarr arrays + for dk in data_key: + try: + arr = zf.create_dataset( + dk, shape=shape, chunks=chunks, fill_value=np.nan, overwrite=not self.skip_existing + ) + except ValueError: + pass # Dataset already exists + + # Recompute any cached properties + zn = Zarr(source=self.zarr_file, file_mode=self.init_file_mode, aws_client_kwargs=self.aws_client_kwargs) + return zf, data_key, zn + + def eval_source(self, coordinates, coordinates_index, out, i, source=None): + if source is None: + source = self.source + + if self.skip_existing: # This section allows previously computed chunks to be skipped + dk = self.zarr_data_key + if isinstance(dk, list): + dk = dk[0] + try: + exists = self.zarr_node.chunk_exists(coordinates_index, data_key=dk, list_dir=self._list_dir) + except ValueError as e: # This was needed in cases where a poor internet connection caused read errors + exists = False + if exists: + _log.info("Skipping {} (already exists)".format(i)) + return out, coordinates_index + + # Make a copy to prevent any possibility of memory corruption + source = Node.from_definition(source.definition) + _log.debug("Creating output format.") + output = dict( + format="zarr_part", + format_kwargs=dict( + part=[[s.start, min(s.stop, self._shape[i]), s.step] for i, s in enumerate(coordinates_index)], + source=self.zarr_file, + mode="a", + ), + ) + _log.debug("Finished creating output format.") + source.set_trait("output_format", output) + _log.debug("output: {}, coordinates.shape: {}".format(output, coordinates.shape)) + _log.debug("Evaluating node.") + + return super(ZarrOutputMixin, self).eval_source(coordinates, coordinates_index, out, i, source) + + +class ParallelOutputZarr(ZarrOutputMixin, Parallel): + pass + + +class ParallelAsyncOutputZarr(ZarrOutputMixin, ParallelAsync): + pass diff --git a/podpac/core/managers/test/test_aws.py b/podpac/core/managers/test/test_aws.py index af9dc2765..b3379bd83 100644 --- a/podpac/core/managers/test/test_aws.py +++ b/podpac/core/managers/test/test_aws.py @@ -3,8 +3,4 @@ class TestAWS(object): - def test_old_module_deprecation(self): - with pytest.warns(DeprecationWarning): - import podpac.core.managers.aws_lambda - - assert podpac.core.managers.aws_lambda.Lambda + pass diff --git a/podpac/core/managers/test/test_multiprocess.py b/podpac/core/managers/test/test_multiprocess.py new file mode 100644 index 000000000..55ef6fa4c --- /dev/null +++ b/podpac/core/managers/test/test_multiprocess.py @@ -0,0 +1,53 @@ +import numpy 
as np +import pytest + +from multiprocessing import Queue + +from podpac.core.coordinates import Coordinates +from podpac.core.algorithm.utility import Arange +from podpac.core.managers.multi_process import Process, _f + + +class TestProcess(object): + def test_mp_results_the_same(self): + coords = Coordinates([[1, 2, 3, 4, 5]], ["time"]) + node = Arange() + o_sp = node.eval(coords) + + node_mp = Process(source=node) + o_mp = node_mp.eval(coords) + + np.testing.assert_array_equal(o_sp.data, o_mp.data) + + def test_mp_results_outputs(self): + node = Arange(outputs=["a", "b"]) + node_mp = Process(source=node) + assert node.outputs == node_mp.outputs + + def test_mp_results_the_same_set_output(self): + coords = Coordinates([[1, 2, 3, 4, 5]], ["time"]) + node = Arange() + o_sp = node.eval(coords) + output = o_sp.copy() + output[:] = np.nan + + node_mp = Process(source=node) + o_mp = node_mp.eval(coords, output) + + np.testing.assert_array_equal(o_sp, output) + + def test_f(self): + coords = Coordinates([[1, 2, 3, 4, 5]], ["time"]) + node = Arange() + q = Queue() + _f(node.json, coords.json, q, {}) + o = q.get() + np.testing.assert_array_equal(o, node.eval(coords)) + + def test_f_fmt(self): + coords = Coordinates([[1, 2, 3, 4, 5]], ["time"]) + node = Arange() + q = Queue() + _f(node.json, coords.json, q, {"format": "dict", "format_kwargs": {}}) + o = q.get() + np.testing.assert_array_equal(o["data"], node.eval(coords).to_dict()["data"]) diff --git a/podpac/core/managers/test/test_multithreading.py b/podpac/core/managers/test/test_multithreading.py index 1b73b7325..d7a8c4247 100644 --- a/podpac/core/managers/test/test_multithreading.py +++ b/podpac/core/managers/test/test_multithreading.py @@ -1,11 +1,13 @@ -import pytest import os - +import sys import time - -from podpac.core.managers.multi_threading import FakeLock from threading import Thread +import pytest + +from podpac import settings +from podpac.core.managers.multi_threading import FakeLock, thread_manager + class TestFakeLock(object): def test_enter_exist_single_thread(self): @@ -27,11 +29,64 @@ def f(s): print("Unlocked", s) assert lock._locked == False - t1 = Thread(target=lambda: f("thread"), daemon=True) - t2 = Thread(target=lambda: f("thread"), daemon=True) + if sys.version_info.major == 2: + t1 = Thread(target=lambda: f("thread")) + t2 = Thread(target=lambda: f("thread")) + t1.daemon = True + t2.daemon = True + else: + t1 = Thread(target=lambda: f("thread"), daemon=True) + t2 = Thread(target=lambda: f("thread"), daemon=True) print("In Main Thread") f("main1") print("Starting Thread") t1.run() t2.run() f("main2") + + +class TestThreadManager(object): + def test_request_release_threads_single_threaded(self): + with settings: + settings["N_THREADS"] = 5 + # Requests + n = thread_manager.request_n_threads(3) + assert n == 3 + n = thread_manager.request_n_threads(3) + assert n == 2 + n = thread_manager.request_n_threads(3) + assert n == 0 + + # releases + assert thread_manager._n_threads_used == 5 + n = thread_manager.release_n_threads(3) + assert n == 3 + n = thread_manager.release_n_threads(2) + assert n == 5 + n = thread_manager.release_n_threads(50) + assert n == 5 + + def test_request_release_threads_multi_threaded(self): + def f(s): + print("In", s) + n1 = thread_manager.release_n_threads(s) + time.sleep(0.05) + n2 = thread_manager.release_n_threads(s) + print("Released", s) + assert n2 >= n1 + + with settings: + settings["N_THREADS"] = 7 + + if sys.version_info.major == 2: + t1 = Thread(target=lambda: f(5)) + t2 = 
Thread(target=lambda: f(6)) + t1.daemon = True + t2.daemon = True + else: + t1 = Thread(target=lambda: f(5), daemon=True) + t2 = Thread(target=lambda: f(6), daemon=True) + f(1) + t1.run() + t2.run() + f(7) diff --git a/podpac/core/managers/test/test_parallel.py b/podpac/core/managers/test/test_parallel.py new file mode 100644 index 000000000..b8debe84c --- /dev/null +++ b/podpac/core/managers/test/test_parallel.py @@ -0,0 +1,124 @@ +import os +import shutil +import sys +import time +import numpy as np +from threading import Thread +import tempfile +import logging + +import pytest + +from podpac import settings +from podpac.core.coordinates import Coordinates +from podpac.core.algorithm.utility import CoordData +from podpac.core.managers.parallel import Parallel, ParallelOutputZarr, ParallelAsync, ParallelAsyncOutputZarr +from podpac.core.managers.multi_process import Process + +logger = logging.getLogger("podpac") +logger.setLevel(logging.DEBUG) + + +class TestParallel(object): + def test_parallel_multi_thread_compute_fill_output(self): + node = CoordData(coord_name="time") + coords = Coordinates([[1, 2, 3, 4, 5]], ["time"]) + node_p = Parallel(source=node, number_of_workers=2, chunks={"time": 2}) + o = node.eval(coords) + o_p = node_p.eval(coords) + + np.testing.assert_array_equal(o, o_p) + + def test_parallel_multi_thread_compute_fill_output2(self): + node = CoordData(coord_name="time") + coords = Coordinates([[1, 2, 3, 4, 5]], ["time"]) + node_p = Parallel(source=node, number_of_workers=2, chunks={"time": 2}) + o = node.eval(coords) + o_p = o.copy() + o_p[:] = np.nan + node_p.eval(coords, o_p) + + np.testing.assert_array_equal(o, o_p) + + @pytest.mark.skipif(sys.version < "3.7", reason="python < 3.7 cannot handle processes launched from threads") + def test_parallel_process(self): + node = Process(source=CoordData(coord_name="time")) + coords = Coordinates([[1, 2, 3, 4, 5]], ["time"]) + node_p = Parallel(source=node, number_of_workers=2, chunks={"time": 2}) + o = node.eval(coords) + o_p = o.copy() + o_p[:] = np.nan + node_p.eval(coords, o_p) + time.sleep(0.1) + + np.testing.assert_array_equal(o, o_p) + + +class TestParallelAsync(object): + @pytest.mark.skipif(sys.version < "3.7", reason="python < 3.7 cannot handle processes launched from threads") + def test_parallel_process_async(self): + node = Process(source=CoordData(coord_name="time")) # , block=False) + coords = Coordinates([[1, 2, 3, 4, 5]], ["time"]) + node_p = ParallelAsync(source=node, number_of_workers=2, chunks={"time": 2}, fill_output=False) + node_p.eval(coords) + time.sleep(0.1) + # Just try to make it run... 
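+
+# Illustrative sketch (an assumption, not part of the test suite): when fill_output is False, the zarr-output
+# nodes exercised below write their results to `zarr_file`, and the archive can be inspected directly with the
+# zarr package, e.g.:
+#
+#   import zarr
+#   zf = zarr.open(os.path.join(tempfile.gettempdir(), "test_parallel_process_zarr.zarr"), mode="r")
+#   print(zf["data"][:])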
+ + +class TestParallelOutputZarr(object): + @pytest.mark.skipif(sys.version < "3.7", reason="python < 3.7 cannot handle processes launched from threads") + def test_parallel_process_zarr(self): + # Can't use tempfile.TemporaryDirectory because multiple processess need access to dir + tmpdir = os.path.join(tempfile.gettempdir(), "test_parallel_process_zarr.zarr") + + node = Process(source=CoordData(coord_name="time")) # , block=False) + coords = Coordinates([[1, 2, 3, 4, 5]], ["time"]) + node_p = ParallelOutputZarr( + source=node, number_of_workers=2, chunks={"time": 2}, fill_output=False, zarr_file=tmpdir + ) + o_zarr = node_p.eval(coords) + time.sleep(0.1) + # print(o_zarr.info) + np.testing.assert_array_equal([1, 2, 3, 4, 5], o_zarr["data"][:]) + + shutil.rmtree(tmpdir) + + @pytest.mark.skipif(sys.version < "3.7", reason="python < 3.7 cannot handle processes launched from threads") + def test_parallel_process_zarr_async(self): + # Can't use tempfile.TemporaryDirectory because multiple processess need access to dir + tmpdir = os.path.join(tempfile.gettempdir(), "test_parallel_process_zarr_async.zarr") + + node = Process(source=CoordData(coord_name="time")) # , block=False) + coords = Coordinates([[1, 2, 3, 4, 5]], ["time"]) + node_p = ParallelAsyncOutputZarr( + source=node, number_of_workers=5, chunks={"time": 2}, fill_output=False, zarr_file=tmpdir + ) + o_zarr = node_p.eval(coords) + # print(o_zarr.info) + time.sleep(0.01) + np.testing.assert_array_equal([1, 2, 3, 4, 5], o_zarr["data"][:]) + + shutil.rmtree(tmpdir) + + @pytest.mark.skipif(sys.version < "3.7", reason="python < 3.7 cannot handle processes launched from threads") + def test_parallel_process_zarr_async_starti(self): + # Can't use tempfile.TemporaryDirectory because multiple processess need access to dir + tmpdir = os.path.join(tempfile.gettempdir(), "test_parallel_process_zarr_async_starti.zarr") + + node = Process(source=CoordData(coord_name="time")) # , block=False) + coords = Coordinates([[1, 2, 3, 4, 5]], ["time"]) + node_p = ParallelAsyncOutputZarr( + source=node, number_of_workers=5, chunks={"time": 2}, fill_output=False, zarr_file=tmpdir, start_i=1 + ) + o_zarr = node_p.eval(coords) + # print(o_zarr.info) + time.sleep(0.01) + np.testing.assert_array_equal([np.nan, np.nan, 3, 4, 5], o_zarr["data"][:]) + + node_p = ParallelAsyncOutputZarr( + source=node, number_of_workers=5, chunks={"time": 2}, fill_output=False, zarr_file=tmpdir, start_i=0 + ) + o_zarr = node_p.eval(coords) + np.testing.assert_array_equal([1, 2, 3, 4, 5], o_zarr["data"][:]) + + shutil.rmtree(tmpdir) diff --git a/podpac/core/node.py b/podpac/core/node.py index 28ab48818..f4a4a0129 100644 --- a/podpac/core/node.py +++ b/podpac/core/node.py @@ -9,21 +9,26 @@ import json import inspect import importlib +import warnings from collections import OrderedDict from copy import deepcopy from hashlib import md5 as hash_alg import numpy as np import traitlets as tl +import six +import podpac from podpac.core.settings import settings from podpac.core.units import ureg, UnitsDataArray from podpac.core.utils import common_doc -from podpac.core.utils import JSONEncoder, is_json_serializable +from podpac.core.utils import JSONEncoder +from podpac.core.utils import cached_property +from podpac.core.utils import trait_is_defined from podpac.core.utils import _get_query_params_from_url, _get_from_url, _get_param from podpac.core.coordinates import Coordinates from podpac.core.style import Style -from podpac.core.cache import CacheCtrl, get_default_cache_ctrl, 
S3CacheStore, make_cache_ctrl +from podpac.core.cache import CacheCtrl, get_default_cache_ctrl, make_cache_ctrl, S3CacheStore, DiskCacheStore from podpac.core.managers.multi_threading import thread_manager @@ -37,7 +42,7 @@ Unit-aware xarray DataArray containing the results of the node evaluation. """, "hash_return": "A unique hash capturing the coordinates and parameters used to evaluate the node. ", - "outdir": "Optional output directory. Uses :attr:`podpac.settings['DISK_CACHE_DIR']` by default", + "outdir": "Optional output directory. Uses :attr:`podpac.settings.cache_path` by default", "definition_return": """ OrderedDict Dictionary containing the location of the Node, the name of the plugin (if required), as well as any @@ -70,7 +75,13 @@ class NodeException(Exception): - """ Summary """ + """ Base class for exceptions when using podpac nodes """ + + pass + + +class NodeDefinitionError(NodeException): + """ Raised for node definition errors, such as when the definition is circular or is not yet available. """ pass @@ -82,9 +93,12 @@ class Node(tl.HasTraits): Attributes ---------- cache_output: bool - Should the node's output be cached? If not provided or None, uses default based on settings. + Should the node's output be cached? If not provided or None, uses default based on settings + (CACHE_NODE_OUTPUT_DEFAULT for general Nodes, and CACHE_DATASOURCE_OUTPUT_DEFAULT for DataSource nodes). + If True, outputs will be cached and retrieved from cache. If False, outputs will not be cached OR retrieved from cache (even if + they exist in cache). cache_update: bool - Default is False. Should the node's cached output be updated from the source data? + Default is True. Should the node's cached output be updated from the source data? cache_ctrl: :class:`podpac.core.cache.cache.CacheCtrl` Class that controls caching. If not provided, uses default based on settings. dtype : type @@ -111,11 +125,16 @@ class Node(tl.HasTraits): outputs = tl.List(tl.Unicode, allow_none=True).tag(attr=True) output = tl.Unicode(default_value=None, allow_none=True).tag(attr=True) units = tl.Unicode(default_value=None, allow_none=True).tag(attr=True) + style = tl.Instance(Style) + dtype = tl.Any(default_value=float) cache_output = tl.Bool() cache_update = tl.Bool(False) cache_ctrl = tl.Instance(CacheCtrl, allow_none=True) - style = tl.Instance(Style) + + # list of attribute names, used by __repr__ and __str__ to display minimal info about the node + # e.g. 
data sources use ['source'] + _repr_keys = [] @tl.default("outputs") def _default_outputs(self): @@ -130,22 +149,8 @@ def _validate_output(self, d): raise ValueError("Invalid output '%s' (available outputs are %s)" % (self.output, self.outputs)) return d["value"] - @tl.default("cache_output") - def _cache_output_default(self): - return settings["CACHE_OUTPUT_DEFAULT"] - - @tl.default("cache_ctrl") - def _cache_ctrl_default(self): - return get_default_cache_ctrl() - - @tl.validate("cache_ctrl") - def _validate_cache_ctrl(self, d): - if d["value"] is None: - d["value"] = CacheCtrl([]) # no cache_stores - return d["value"] - @tl.default("style") - def _style_default(self): + def _default_style(self): return Style() @tl.validate("units") @@ -153,6 +158,14 @@ def _validate_units(self, d): ureg.Unit(d["value"]) # will throw an exception if this is not a valid pint Unit return d["value"] + @tl.default("cache_output") + def _cache_output_default(self): + return settings["CACHE_NODE_OUTPUT_DEFAULT"] + + @tl.default("cache_ctrl") + def _cache_ctrl_default(self): + return get_default_cache_ctrl() + # debugging _requested_coordinates = tl.Instance(Coordinates, allow_none=True) _output = tl.Instance(UnitsDataArray, allow_none=True) @@ -160,6 +173,10 @@ def _validate_units(self, d): # Flag that is True if the Node was run multi-threaded, or None if the question doesn't apply _multi_threaded = tl.Bool(allow_none=True, default_value=None) + # util + _definition_guard = False + _traits_initialized_guard = False + def __init__(self, **kwargs): """ Do not overwrite me """ @@ -174,13 +191,18 @@ def __init__(self, **kwargs): # on subsequent initializations, they will already be read_only. with self.hold_trait_notifications(): for name, trait in self.traits().items(): - if trait.metadata.get("readonly") or trait.metadata.get("attr"): + if settings["DEBUG"]: + trait.read_only = False + elif trait.metadata.get("readonly") or trait.metadata.get("attr"): if name in tkwargs: self.set_trait(name, tkwargs.pop(name)) trait.read_only = True - # Call traitlest constructor - super(Node, self).__init__(**tkwargs) + # Call traitlets constructor + super(Node, self).__init__(**tkwargs) + + self._traits_initialized_guard = True + self.init() def _first_init(self, **kwargs): @@ -203,6 +225,28 @@ def init(self): """ pass + @property + def attrs(self): + """List of node attributes""" + return [name for name in self.traits() if self.trait_metadata(name, "attr")] + + @property + def _repr_info(self): + keys = self._repr_keys[:] + if self.trait_is_defined("output") and self.output is not None: + if "output" not in keys: + keys.append("output") + elif self.trait_is_defined("outputs") and self.outputs is not None: + if "outputs" not in keys: + keys.append("outputs") + return ", ".join("%s=%s" % (key, repr(getattr(self, key))) for key in keys) + + def __repr__(self): + return "<%s(%s)>" % (self.__class__.__name__, self._repr_info) + + def __str__(self): + return "<%s(%s) attrs: %s>" % (self.__class__.__name__, self._repr_info, ", ".join(self.attrs)) + @common_doc(COMMON_DOC) def eval(self, coordinates, output=None): """ @@ -241,7 +285,7 @@ def eval_group(self, group): def find_coordinates(self): """ - Get all available native coordinates for the Node. Implemented in child classes. + Get all available coordinates for the Node. Implemented in child classes. 
Returns ------- @@ -282,6 +326,9 @@ def create_output_array(self, coords, data=np.nan, **kwargs): return UnitsDataArray.create(coords, data=data, outputs=self.outputs, dtype=self.dtype, attrs=attrs, **kwargs) + def trait_is_defined(self, name): + return trait_is_defined(self, name) + # ----------------------------------------------------------------------------------------------------------------- # Serialization # ----------------------------------------------------------------------------------------------------------------- @@ -299,21 +346,10 @@ def base_ref(self): return self.__class__.__name__ @property - def base_definition(self): - """ - Base node definition. - - This property is implemented in the primary base nodes (DataSource, Algorithm, and Compositor). Node - subclasses with additional attrs will need to extend this property. - - Returns - ------- - {definition_return} - - """ - + def _base_definition(self): d = OrderedDict() + # node and plugin if self.__module__ == "podpac": d["node"] = self.__class__.__name__ elif self.__module__.startswith("podpac."): @@ -323,45 +359,43 @@ def base_definition(self): d["plugin"] = self.__module__ d["node"] = self.__class__.__name__ + # attrs/inputs attrs = {} - lookup_attrs = {} - - for key, value in self.traits().items(): - if not value.metadata.get("attr", False): - continue - - attr = getattr(self, key) + inputs = {} + for name in self.attrs: + value = getattr(self, name) + + if ( + isinstance(value, Node) + or (isinstance(value, (list, tuple, np.ndarray)) and all(isinstance(elem, Node) for elem in value)) + or (isinstance(value, dict) and all(isinstance(elem, Node) for elem in value.values())) + ): + inputs[name] = value + else: + attrs[name] = value - if key == "units" and attr is None: - continue + if "units" in attrs and attrs["units"] is None: + del attrs["units"] - # check serializable - if not is_json_serializable(attr, cls=JSONEncoder): - raise NodeException("Cannot serialize attr '%s' with type '%s'" % (key, type(attr))) + if "outputs" in attrs and attrs["outputs"] is None: + del attrs["outputs"] - if isinstance(attr, Node): - lookup_attrs[key] = attr - else: - attrs[key] = attr + if "output" in attrs and attrs["output"] is None: + del attrs["output"] if attrs: - # remove unnecessary attrs - if self.outputs is None and "outputs" in attrs: - del attrs["outputs"] - if self.output is None and "output" in attrs: - del attrs["output"] + d["attrs"] = attrs - d["attrs"] = OrderedDict([(key, attrs[key]) for key in sorted(attrs.keys())]) - - if lookup_attrs: - d["lookup_attrs"] = OrderedDict([(key, lookup_attrs[key]) for key in sorted(lookup_attrs.keys())]) + if inputs: + d["inputs"] = inputs + # style if self.style.definition: d["style"] = self.style.definition return d - @property + @cached_property def definition(self): """ Full node definition. @@ -372,84 +406,104 @@ def definition(self): Dictionary-formatted node definition. 
""" - nodes = [] - refs = [] - definitions = [] - - def add_node(node): - for ref, n in zip(refs, nodes): - if node.hash == n.hash: - return ref - - # get base definition and then replace nodes with references, adding nodes depth first - d = node.base_definition - if "lookup_source" in d: - d["lookup_source"] = add_node(d["lookup_source"]) - if "lookup_attrs" in d: - for key, attr_node in d["lookup_attrs"].items(): - d["lookup_attrs"][key] = add_node(attr_node) - if "inputs" in d: - for key, input_node in d["inputs"].items(): - if input_node is not None: - d["inputs"][key] = add_node(input_node) - if "sources" in d: - sources = [] # we need this list so that we don't overwrite the actual sources array - for i, source_node in enumerate(d["sources"]): - sources.append(add_node(source_node)) - d["sources"] = sources - - # get base ref and then ensure it is unique - ref = node.base_ref - while ref in refs: - if re.search("_[1-9][0-9]*$", ref): - ref, i = ref.rsplit("_", 1) - i = int(i) - else: - i = 0 - ref = "%s_%d" % (ref, i + 1) - - nodes.append(node) - refs.append(ref) - definitions.append(d) - - return ref - - add_node(self) - - return OrderedDict(zip(refs, definitions)) + if getattr(self, "_definition_guard", False): + raise NodeDefinitionError("node definition has a circular dependency") - @property - def pipeline(self): - """Deprecated. See Node.definition and Node.from_definition.""" - from podpac.core.pipeline import Pipeline + if not getattr(self, "_traits_initialized_guard", False): + raise NodeDefinitionError("node is not yet fully initialized") - return Pipeline(definition=OrderedDict({"nodes": self.definition})) + try: + self._definition_guard = True - @property + nodes = [] + refs = [] + definitions = [] + + def add_node(node): + for ref, n in zip(refs, nodes): + if node == n: + return ref + + # get base definition + d = node._base_definition + + if "inputs" in d: + # sort and shallow copy + d["inputs"] = OrderedDict([(key, d["inputs"][key]) for key in sorted(d["inputs"].keys())]) + + # replace nodes with references, adding nodes depth first + for key, value in d["inputs"].items(): + if isinstance(value, Node): + d["inputs"][key] = add_node(value) + elif isinstance(value, (list, tuple, np.ndarray)): + d["inputs"][key] = [add_node(item) for item in value] + elif isinstance(value, dict): + d["inputs"][key] = {k: add_node(v) for k, v in value.items()} + else: + raise TypeError("Invalid input '%s' of type '%s': %s" % (key, type(value))) + + if "attrs" in d: + # sort and shallow copy + d["attrs"] = OrderedDict([(key, d["attrs"][key]) for key in sorted(d["attrs"].keys())]) + + # get base ref and then ensure it is unique + ref = node.base_ref + while ref in refs: + if re.search("_[1-9][0-9]*$", ref): + ref, i = ref.rsplit("_", 1) + i = int(i) + else: + i = 0 + ref = "%s_%d" % (ref, i + 1) + + nodes.append(node) + refs.append(ref) + definitions.append(d) + + return ref + + # add top level node + add_node(self) + + # finalize, verify serializable, and return + definition = OrderedDict(zip(refs, definitions)) + definition["podpac_version"] = podpac.__version__ + json.dumps(definition, cls=JSONEncoder) + return definition + + finally: + self._definition_guard = False + + @cached_property def json(self): - """definition for this node in json format + """Definition for this node in JSON format.""" - Returns - ------- - str - JSON-formatted node definition. 
- """ return json.dumps(self.definition, separators=(",", ":"), cls=JSONEncoder) - @property + @cached_property def json_pretty(self): + """Definition for this node in JSON format, with indentation suitable for display.""" + return json.dumps(self.definition, indent=4, cls=JSONEncoder) - @property + @cached_property def hash(self): - # Style should not be part of the hash - defn = self.json + """ hash for this node, used in caching and to determine equality. """ + + # deepcopy so that the cached definition property is not modified by the deletes below + d = deepcopy(self.definition) + + # omit version + if "podpac_version" in d: + del d["podpac_version"] - # Note: this ONLY works because the Style node has NO dictionaries as part - # of its attributes - hashstr = re.sub(r'"style":\{.*?\},?', "", defn) + # omit style in every node + for k in d: + if "style" in d[k]: + del d[k]["style"] - return hash_alg(hashstr.encode("utf-8")).hexdigest() + s = json.dumps(d, separators=(",", ":"), cls=JSONEncoder) + return hash_alg(s.encode("utf-8")).hexdigest() def save(self, path): """ @@ -468,6 +522,16 @@ def save(self, path): with open(path, "w") as f: json.dump(self.definition, f, separators=(",", ":"), cls=JSONEncoder) + def __eq__(self, other): + if not isinstance(other, Node): + return False + return self.hash == other.hash + + def __ne__(self, other): + if not isinstance(other, Node): + return True + return self.hash != other.hash + # ----------------------------------------------------------------------------------------------------------------- # Caching Interface # ----------------------------------------------------------------------------------------------------------------- @@ -494,12 +558,17 @@ def get_cache(self, key, coordinates=None): Cached data not found. """ - if not self.has_cache(key, coordinates=coordinates): + try: + self.definition + except NodeDefinitionError as e: + raise NodeException("Cache unavailable, %s (key='%s')" % (e.args[0], key)) + + if self.cache_ctrl is None or not self.has_cache(key, coordinates=coordinates): raise NodeException("cached data not found for key '%s' and coordinates %s" % (key, coordinates)) return self.cache_ctrl.get(self, key, coordinates=coordinates) - def put_cache(self, data, key, coordinates=None, overwrite=False): + def put_cache(self, data, key, coordinates=None, overwrite=True): """ Cache data for this node. @@ -512,13 +581,22 @@ def put_cache(self, data, key, coordinates=None, overwrite=False): coordinates : podpac.Coordinates, optional Coordinates that the cached data depends on. Omit for coordinate-independent data. overwrite : bool, optional - Overwrite existing data, default False + Overwrite existing data, default True. Raises ------ NodeException Cached data already exists (and overwrite is False) """ + + try: + self.definition + except NodeDefinitionError as e: + raise NodeException("Cache unavailable, %s (key='%s')" % (e.args[0], key)) + + if self.cache_ctrl is None: + return + if not overwrite and self.has_cache(key, coordinates=coordinates): raise NodeException("Cached data already exists for key '%s' and coordinates %s" % (key, coordinates)) @@ -541,10 +619,19 @@ def has_cache(self, key, coordinates=None): bool True if there is cached data for this node, key, and coordinates. 
""" + + try: + self.definition + except NodeDefinitionError as e: + raise NodeException("Cache unavailable, %s (key='%s')" % (e.args[0], key)) + + if self.cache_ctrl is None: + return False + with thread_manager.cache_lock: return self.cache_ctrl.has(self, key, coordinates=coordinates) - def rem_cache(self, key, coordinates=None, mode=None): + def rem_cache(self, key, coordinates=None, mode="all"): """ Clear cached data for this node. @@ -556,13 +643,22 @@ def rem_cache(self, key, coordinates=None, mode=None): Default is None. Delete cached objects for these coordinates. If `'*'`, cached data is deleted for all coordinates, including coordinate-independent data. If None, will only affect coordinate-independent data. mode: str, optional - Specify which cache stores are affected. + Specify which cache stores are affected. Default 'all'. See Also --------- `podpac.core.cache.cache.CacheCtrl.clear` to remove ALL cache for ALL nodes. """ + + try: + self.definition + except NodeDefinitionError as e: + raise NodeException("Cache unavailable, %s (key='%s')" % (e.args[0], key)) + + if self.cache_ctrl is None: + return + self.cache_ctrl.rem(self, key=key, coordinates=coordinates, mode=mode) # --------------------------------------------------------# @@ -591,9 +687,12 @@ def from_definition(cls, definition): load : create a node from file """ - from podpac.core.data.datasource import DataSource - from podpac.core.algorithm.algorithm import BaseAlgorithm - from podpac.core.compositor import Compositor + if "podpac_version" in definition and definition["podpac_version"] != podpac.__version__: + warnings.warn( + "node definition version mismatch " + "(this node was created with podpac version '%s', " + "but your current podpac version is '%s')" % (definition["podpac_version"], podpac.__version__) + ) if len(definition) == 0: raise ValueError("Invalid definition: definition cannot be empty.") @@ -601,6 +700,9 @@ def from_definition(cls, definition): # parse node definitions in order nodes = OrderedDict() for name, d in definition.items(): + if name == "podpac_version": + continue + if "node" not in d: raise ValueError("Invalid definition for node '%s': 'node' property required" % name) @@ -622,84 +724,20 @@ def from_definition(cls, definition): # parse and configure kwargs kwargs = {} - whitelist = ["node", "attrs", "lookup_attrs", "plugin", "style"] - - # DataSource, Compositor, and Algorithm specific properties - parents = inspect.getmro(node_class) - - if DataSource in parents: - if "attrs" in d: - if "source" in d["attrs"]: - raise ValueError( - "Invalid definition for node '%s': DataSource 'attrs' cannot have a 'source' property." 
- % name - ) - - if "lookup_source" in d["attrs"]: - raise ValueError( - "Invalid definition for node '%s': DataSource 'attrs' cannot have a 'lookup_source' property" - % name - ) - - if "interpolation" in d["attrs"]: - raise ValueError( - "Invalid definition for node '%s': DataSource 'attrs' cannot have an 'interpolation' property" - % name - ) - - if "source" in d: - kwargs["source"] = d["source"] - whitelist.append("source") - - elif "lookup_source" in d: - kwargs["source"] = _get_subattr(nodes, name, d["lookup_source"]) - whitelist.append("lookup_source") - - if "interpolation" in d: - kwargs["interpolation"] = d["interpolation"] - whitelist.append("interpolation") - - if Compositor in parents: - if "attrs" in d: - if "interpolation" in d["attrs"]: - raise ValueError( - "Invalid definition for node '%s': Compositor 'attrs' cannot have an 'interpolation' property" - % name - ) - - if "sources" in d: - sources = [_get_subattr(nodes, name, source) for source in d["sources"]] - kwargs["sources"] = np.array(sources) - whitelist.append("sources") - - if "interpolation" in d: - kwargs["interpolation"] = d["interpolation"] - whitelist.append("interpolation") - - if BaseAlgorithm in parents: - if "attrs" in d: - if "inputs" in d["attrs"]: - raise ValueError( - "Invalid definition for node '%s': Algorithm 'attrs' cannot have an 'inputs' property" - % name - ) - - if "inputs" in d: - inputs = {k: _get_subattr(nodes, name, v) for k, v in d["inputs"].items()} - kwargs.update(inputs) - whitelist.append("inputs") - for k, v in d.get("attrs", {}).items(): kwargs[k] = v + for k, v in d.get("inputs", {}).items(): + kwargs[k] = _lookup_input(nodes, name, v) + for k, v in d.get("lookup_attrs", {}).items(): - kwargs[k] = _get_subattr(nodes, name, v) + kwargs[k] = _lookup_attr(nodes, name, v) if "style" in d: kwargs["style"] = Style.from_definition(d["style"]) for k in d: - if k not in whitelist: + if k not in ["node", "inputs", "attrs", "lookup_attrs", "plugin", "style"]: raise ValueError("Invalid definition for node '%s': unexpected property '%s'" % (name, k)) nodes[name] = node_class(**kwargs) @@ -810,19 +848,103 @@ def from_url(cls, url): return cls.from_definition(d) -def _get_subattr(nodes, name, ref): - refs = ref.split(".") - try: - attr = nodes[refs[0]] - for _name in refs[1:]: - attr = getattr(attr, _name) - except (KeyError, AttributeError): - raise ValueError("Invalid definition for node '%s': reference to nonexistent node/attribute '%s'" % (name, ref)) +def _lookup_input(nodes, name, value): + # containers + if isinstance(value, list): + return [_lookup_input(nodes, name, elem) for elem in value] + + if isinstance(value, dict): + return {k: _lookup_input(nodes, name, v) for k, v in value.items()} + + # node reference + if not isinstance(value, six.string_types): + raise ValueError( + "Invalid definition for node '%s': invalid reference '%s' of type '%s' in inputs" + % (name, value, type(value)) + ) + + if not value in nodes: + raise ValueError( + "Invalid definition for node '%s': reference to nonexistent node '%s' in inputs" % (name, value) + ) + + node = nodes[value] + + # copy in debug mode + if settings["DEBUG"]: + node = deepcopy(node) + + return node + + +def _lookup_attr(nodes, name, value): + # containers + if isinstance(value, list): + return [_lookup_attr(nodes, name, elem) for elem in value] + + if isinstance(value, dict): + return {_k: _lookup_attr(nodes, name, v) for k, v in value.items()} + + if not isinstance(value, six.string_types): + raise ValueError( + "Invalid definition for 
node '%s': invalid reference '%s' of type '%s' in lookup_attrs" + % (name, value, type(value)) + ) + + # node + elems = value.split(".") + if elems[0] not in nodes: + raise ValueError( + "Invalid definition for node '%s': reference to nonexistent node '%s' in lookup_attrs" % (name, elems[0]) + ) + + # subattrs + attr = nodes[elems[0]] + for n in elems[1:]: + if not hasattr(attr, n): + raise ValueError( + "Invalid definition for node '%s': reference to nonexistent attribute '%s' in lookup_attrs value '%s" + % (name, n, value) + ) + attr = getattr(attr, n) + + # copy in debug mode if settings["DEBUG"]: attr = deepcopy(attr) + return attr +# --------------------------------------------------------# +# Mixins +# --------------------------------------------------------# + + +class NoCacheMixin(tl.HasTraits): + """ Mixin to use no cache by default. """ + + cache_ctrl = tl.Instance(CacheCtrl, allow_none=True) + + @tl.default("cache_ctrl") + def _cache_ctrl_default(self): + return CacheCtrl([]) + + +class DiskCacheMixin(tl.HasTraits): + """ Mixin to add disk caching to the Node by default. """ + + cache_ctrl = tl.Instance(CacheCtrl, allow_none=True) + + @tl.default("cache_ctrl") + def _cache_ctrl_default(self): + # get the default cache_ctrl and addd a disk cache store if necessary + default_ctrl = get_default_cache_ctrl() + stores = default_ctrl._cache_stores + if not any(isinstance(store, DiskCacheStore) for store in default_ctrl._cache_stores): + stores.append(DiskCacheStore()) + return CacheCtrl(stores) + + # --------------------------------------------------------# # Decorators # --------------------------------------------------------# @@ -850,7 +972,7 @@ def wrapper(self, coordinates, output=None): key = cache_key cache_coordinates = coordinates.transpose(*sorted(coordinates.dims)) # order agnostic caching - if not self.cache_update and self.has_cache(key, cache_coordinates): + if not self.cache_update and self.cache_output and self.has_cache(key, cache_coordinates): data = self.get_cache(key, cache_coordinates) if output is not None: order = [dim for dim in output.dims if dim not in data.dims] + list(data.dims) @@ -858,11 +980,8 @@ def wrapper(self, coordinates, output=None): self._from_cache = True else: data = fn(self, coordinates, output=output) - - # We need to check if the cache now has the key because it is possible that - # the previous function call added the key with the coordinates to the cache - if self.cache_output and not (self.has_cache(key, cache_coordinates) and not self.cache_update): - self.put_cache(data, key, cache_coordinates, overwrite=self.cache_update) + if self.cache_output: + self.put_cache(data, key, cache_coordinates) self._from_cache = False # extract single output, if necessary @@ -874,7 +993,7 @@ def wrapper(self, coordinates, output=None): order = [dim for dim in coordinates.idims if dim in data.dims] if "output" in data.dims: order.append("output") - data = data.transpose(*order) + data = data.transpose(*order, transpose_coords=False) if settings["DEBUG"]: self._output = data @@ -885,99 +1004,3 @@ def wrapper(self, coordinates, output=None): return data return wrapper - - -def cache_func(key, depends=None): - """ - Decorating for caching a function's output based on a key. - - Parameters - ----------- - key: str - Key used for caching. - depends: str, list, traitlets.All (optional) - Default is None. Any traits that the cached property depends on. 
The cached function may NOT - change the value of any of these dependencies (this will result in a RecursionError) - - - Notes - ----- - This decorator cannot handle function input parameters. - - If the function uses any tagged attributes, these will essentially operate like dependencies - because the cache key changes based on the node definition, which is affected by tagged attributes. - - Examples - ---------- - >>> from podpac import Node - >>> from podpac.core.node import cache_func - >>> import traitlets as tl - >>> class MyClass(Node): - value = tl.Int(0) - @cache_func('add') - def add_value(self): - self.value += 1 - return self.value - @cache_func('square', depends='value') - def square_value_depends(self): - return self.value - - >>> n = MyClass(cache_ctrl=None) - >>> n.add_value() # The function as defined is called - 1 - >>> n.add_value() # The function as defined is called again, since we have specified no caching - 2 - >>> n.cache_ctrl = CacheCtrl([RamCacheStore()]) - >>> n.add_value() # The function as defined is called again, and the value is stored in memory - 3 - >>> n.add_value() # The value is retrieved from disk, note the change in n.value is not captured - 3 - >>> n.square_value_depends() # The function as defined is called, and the value is stored in memory - 16 - >>> n.square_value_depends() # The value is retrieved from memory - 16 - >>> n.value += 1 - >>> n.square_value_depends() # The function as defined is called, and the value is stored in memory. Note the change in n.value is captured. - 25 - """ - # This is the actual decorator which will be evaluated and returns the wrapped function - def cache_decorator(func): - # This is the initial wrapper that sets up the observations - @functools.wraps(func) - def cache_wrapper(self): - # This is the function that updates the cached based on observed traits - def cache_updator(change): - # print("Updating value on self:", id(self)) - out = func(self) - self.put_cache(out, key, overwrite=True) - - if depends: - # This sets up the observer on the dependent traits - # print ("setting up observer on self: ", id(self)) - self.observe(cache_updator, depends) - # Since attributes could change on instantiation, anything we previously - # stored is likely out of date. So, force and update to the cache. - cache_updator(None) - - # This is the final wrapper the continues to fetch data from cache - # after the observer has been set up. 
- @functools.wraps(func) - def cached_function(): - try: - out = self.get_cache(key) - except NodeException: - out = func(self) - self.put_cache(out, key) - return out - - # Since this is the first time the function is run, set the new wrapper - # on the class instance so that the current function won't be called again - # (which would set up an additional observer) - setattr(self, func.__name__, cached_function) - - # Return the value on the first run - return cached_function() - - return cache_wrapper - - return cache_decorator diff --git a/podpac/core/pipeline/__init__.py b/podpac/core/pipeline/__init__.py deleted file mode 100644 index 7df8cb341..000000000 --- a/podpac/core/pipeline/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from podpac.core.pipeline.pipeline import Pipeline, PipelineError diff --git a/podpac/core/pipeline/output.py b/podpac/core/pipeline/output.py deleted file mode 100644 index f61b95772..000000000 --- a/podpac/core/pipeline/output.py +++ /dev/null @@ -1,212 +0,0 @@ -""" -Pipeline output Summary -""" - -from __future__ import absolute_import, division, print_function, unicode_literals - -import os -import warnings -from collections import OrderedDict -from io import BytesIO - -try: - import cPickle # Python 2.7 -except: - import _pickle as cPickle - -import traitlets as tl -import numpy as np -import traitlets as tl - -from podpac.core.node import Node - - -class Output(tl.HasTraits): - """ - Base class for Pipeline Outputs. - - Attributes - ---------- - node : Node - output node - name : string - output name - """ - - node = tl.Instance(Node) - name = tl.Unicode() - - def write(self, output, coordinates): - """Write the node output - - Arguments - --------- - output : UnitsDataArray - Node evaluation output to write - coordinates : Coordinates - Evaluated coordinates. - - Raises - ------ - NotImplementedError - Description - """ - raise NotImplementedError - - @property - def definition(self): - d = OrderedDict() - for key, value in self.traits().items(): - if value.metadata.get("attr", False): - d[key] = getattr(self, key) - return d - - -class NoOutput(Output): - """ No Output """ - - def __init__(self, node, name): - super(NoOutput, self).__init__(node=node, name=name) - - def write(self, output, coordinates): - pass - - -class FileOutput(Output): - """ Output a file to the local filesystem. - - Attributes - ---------- - format : TYPE - Description - outdir : TYPE - Description - """ - - outdir = tl.Unicode() - format = tl.CaselessStrEnum(values=["pickle", "geotif", "png", "nc", "json"], default_value="pickle").tag(attr=True) - mode = tl.Unicode(default_value="file").tag(attr=True) - - _path = tl.Unicode(allow_none=True, default_value=None) - - def __init__(self, node, name, format=None, outdir=None, mode=None): - kwargs = {} - if format is not None: - kwargs["format"] = format - if outdir is not None: - kwargs["outdir"] = outdir - if mode is not None: - kwargs["mode"] = mode - super(FileOutput, self).__init__(node=node, name=name, **kwargs) - - @property - def path(self): - return self._path - - # TODO: docstring? 
- def write(self, output, coordinates): - filename = "%s_%s_%s" % (self.name, self.node.hash, coordinates.hash) - path = os.path.join(self.outdir, filename) - - if self.format == "pickle": - path = "%s.pkl" % path - with open(path, "wb") as f: - cPickle.dump(output, f) - elif self.format == "png": - raise NotImplementedError("format '%s' not yet implemented" % self.format) - elif self.format == "geotif": - raise NotImplementedError("format '%s' not yet implemented" % self.format) - elif self.format == "nc": - raise NotImplementedError("format '%s' not yet implemented" % self.format) - elif self.format == "json": - raise NotImplementedError("format '$s' not yet implemented" % self.format) - - self._path = path - - -class FTPOutput(Output): - """Output a file and send over FTP. - - Attributes - ---------- - url : TYPE - Description - user : TYPE - Description - """ - - url = tl.Unicode() - user = tl.Unicode() - pw = tl.Unicode() - - def __init__(self, node, name, url=None, user=None, pw=None): - kwargs = {} - if url is not None: - kwargs["url"] = url - if user is not None: - kwargs["user"] = user - if pw is not None: - kwargs["pw"] = pw - super(FTPOutput, self).__init__(node=node, name=name, **kwargs) - - -class S3Output(Output): - """Output a file and send to S3 - - Attributes - ---------- - bucket : TYPE - Description - user : TYPE - Description - """ - - bucket = tl.Unicode() - user = tl.Unicode() - - def __init__(self, node, name, bucket=None, user=None): - kwargs = {} - if bucket is not None: - kwargs["bucket"] = bucket - if user is not None: - kwargs["user"] = user - super(S3Output, self).__init__(node=node, name=name, **kwargs) - - -class ImageOutput(Output): - """Output an image in RAM - - Attributes - ---------- - format : TYPE - Description - image : TYPE - Description - vmax : TYPE - Description - vmin : TYPE - Description - """ - - format = tl.CaselessStrEnum(values=["png"], default_value="png").tag(attr=True) - mode = tl.Unicode(default_value="image").tag(attr=True) - vmin = tl.CFloat(allow_none=True, default_value=np.nan).tag(attr=True) - vmax = tl.CFloat(allow_none=True, default_value=np.nan).tag(attr=True) - image = tl.Bytes(allow_none=True, default_value=None) - - def __init__(self, node, name, format=None, mode=None, vmin=None, vmax=None): - kwargs = {} - if format is not None: - kwargs["format"] = format - if mode is not None: - kwargs["mode"] = mode - if vmin is not None: - kwargs["vmin"] = vmin - if vmax is not None: - kwargs["vmax"] = vmax - - super(ImageOutput, self).__init__(node=node, name=name, **kwargs) - - # TODO: docstring? 
- def write(self, output, coordinates): - self.image = output.to_image(format=self.format, vmin=self.vmin, vmax=self.vmax, return_base64=True) diff --git a/podpac/core/pipeline/pipeline.py b/podpac/core/pipeline/pipeline.py deleted file mode 100644 index a44527e63..000000000 --- a/podpac/core/pipeline/pipeline.py +++ /dev/null @@ -1,295 +0,0 @@ -""" -Pipeline Summary -""" - -from __future__ import division, unicode_literals, print_function, absolute_import - -import inspect -import warnings -import importlib -from collections import OrderedDict -import json -from copy import deepcopy - -import traitlets as tl -import numpy as np - -from podpac.core.settings import settings -from podpac.core.utils import OrderedDictTrait, JSONEncoder -from podpac.core.node import Node, NodeException -from podpac.core.style import Style -from podpac.core.data.datasource import DataSource -from podpac.core.algorithm.algorithm import BaseAlgorithm -from podpac.core.compositor import Compositor - -from podpac.core.pipeline.output import Output, NoOutput, FileOutput, S3Output, FTPOutput, ImageOutput - - -class PipelineError(NodeException): - """ - Raised when parsing a Pipeline definition fails. - """ - - pass - - -class Pipeline(Node): - """Deprecated. See Node.definition and Node.from_definition.""" - - definition = OrderedDictTrait(help="pipeline definition") - json = tl.Unicode(help="JSON definition") - pipeline_output = tl.Instance(Output, help="pipeline output") - do_write_output = tl.Bool(True) - - def _first_init(self, path=None, **kwargs): - warnings.warn( - "Pipelines are deprecated and will be removed in podpac 2.0. See Node.definition and " - "Node.from_definition for Node serialization.", - DeprecationWarning, - ) - - if (path is not None) + ("definition" in kwargs) + ("json" in kwargs) != 1: - raise TypeError("Pipeline requires exactly one 'path', 'json', or 'definition' argument") - - if path is not None: - with open(path) as f: - kwargs["definition"] = json.load(f, object_pairs_hook=OrderedDict) - - return super(Pipeline, self)._first_init(**kwargs) - - @tl.validate("json") - def _json_validate(self, proposal): - s = proposal["value"] - definition = json.loads(s, object_pairs_hook=OrderedDict) - parse_pipeline_definition(definition) - return json.dumps( - json.loads(s, object_pairs_hook=OrderedDict), separators=(",", ":"), cls=JSONEncoder - ) # standardize - - @tl.validate("definition") - def _validate_definition(self, proposal): - definition = proposal["value"] - parse_pipeline_definition(definition) - return definition - - @tl.default("json") - def _json_from_definition(self): - return json.dumps(self.definition, separators=(",", ":"), cls=JSONEncoder) - - @tl.default("definition") - def _definition_from_json(self): - return json.loads(self.json, object_pairs_hook=OrderedDict) - - @tl.default("pipeline_output") - def _parse_definition(self): - return parse_pipeline_definition(self.definition) - - def eval(self, coordinates, output=None): - """Evaluate the pipeline, writing the output if one is defined. 
- - Parameters - ---------- - coordinates : TYPE - Description - """ - - self._requested_coordinates = coordinates - - output = self.pipeline_output.node.eval(coordinates, output) - if self.do_write_output: - self.pipeline_output.write(output, coordinates) - - self._output = output - return output - - # ----------------------------------------------------------------------------------------------------------------- - # properties, forwards output node - # ----------------------------------------------------------------------------------------------------------------- - - @property - def node(self): - return self.pipeline_output.node - - @property - def units(self): - return self.node.units - - @property - def dtype(self): - return self.node.dtype - - @property - def cache_ctrl(self): - return self.node.cache_ctrl - - @property - def style(self): - return self.node.style - - -# --------------------------------------------------------------------------------------------------------------------- -# Helper functions -# --------------------------------------------------------------------------------------------------------------------- - - -def parse_pipeline_definition(definition): - if "nodes" not in definition: - raise PipelineError("Pipeline definition requires 'nodes' property") - - if len(definition["nodes"]) == 0: - raise PipelineError("'nodes' property cannot be empty") - - # parse node definitions - nodes = OrderedDict() - for key, d in definition["nodes"].items(): - nodes[key] = _parse_node_definition(nodes, key, d) - - # parse output definition - output = _parse_output_definition(nodes, definition.get("pipeline_output", {})) - - return output - - -def _parse_node_definition(nodes, name, d): - # get node class - module_root = d.get("plugin", "podpac") - node_string = "%s.%s" % (module_root, d["node"]) - module_name, node_name = node_string.rsplit(".", 1) - try: - module = importlib.import_module(module_name) - except ImportError: - raise PipelineError("No module found '%s'" % module_name) - try: - node_class = getattr(module, node_name) - except AttributeError: - raise PipelineError("Node '%s' not found in module '%s'" % (node_name, module_name)) - - # parse and configure kwargs - kwargs = {} - whitelist = ["node", "attrs", "lookup_attrs", "plugin", "style"] - - # DataSource, Compositor, and Algorithm specific properties - parents = inspect.getmro(node_class) - - if DataSource in parents: - if "attrs" in d: - if "source" in d["attrs"]: - raise PipelineError("The 'source' property cannot be in attrs") - - if "lookup_source" in d["attrs"]: - raise PipelineError("The 'lookup_source' property cannot be in attrs") - - if "source" in d: - kwargs["source"] = d["source"] - whitelist.append("source") - - elif "lookup_source" in d: - kwargs["source"] = _get_subattr(nodes, name, d["lookup_source"]) - whitelist.append("lookup_source") - - if Compositor in parents: - if "sources" in d: - sources = [_get_subattr(nodes, name, source) for source in d["sources"]] - kwargs["sources"] = np.array(sources) - whitelist.append("sources") - - if DataSource in parents or Compositor in parents: - if "attrs" in d and "interpolation" in d["attrs"]: - raise PipelineError("The 'interpolation' property cannot be in attrs") - - if "interpolation" in d: - kwargs["interpolation"] = d["interpolation"] - whitelist.append("interpolation") - - if BaseAlgorithm in parents: - if "inputs" in d: - inputs = {k: _get_subattr(nodes, name, v) for k, v in d["inputs"].items()} - kwargs.update(inputs) - whitelist.append("inputs") - 
- for k, v in d.get("attrs", {}).items(): - kwargs[k] = v - - for k, v in d.get("lookup_attrs", {}).items(): - kwargs[k] = _get_subattr(nodes, name, v) - - if "style" in d: - kwargs["style"] = Style.from_definition(d["style"]) - - for key in d: - if key not in whitelist: - raise PipelineError("node '%s' has unexpected property %s" % (name, key)) - - # return node info - return node_class(**kwargs) - - -def _parse_output_definition(nodes, d): - # node (uses last node by default) - if "node" in d: - name = d["node"] - else: - name = list(nodes.keys())[-1] - - node = _get_subattr(nodes, "output", name) - - # output parameters - config = {k: v for k, v in d.items() if k not in ["node", "mode", "plugin", "output"]} - - # get output class from mode - if "plugin" not in d: - # core output (from mode) - mode = d.get("mode", "none") - if mode == "none": - output_class = NoOutput - elif mode == "file": - output_class = FileOutput - elif mode == "ftp": - output_class = FTPOutput - elif mode == "s3": - output_class = S3Output - elif mode == "image": - output_class = ImageOutput - else: - raise PipelineError("output has unexpected mode '%s'" % mode) - - output = output_class(node, name, **config) - - else: - # custom output (from plugin) - custom_output = "%s.%s" % (d["plugin"], d["output"]) - module_name, class_name = custom_output.rsplit(".", 1) - try: - module = importlib.import_module(module_name) - except ImportError: - raise PipelineError("No module found '%s'" % module_name) - try: - output_class = getattr(module, class_name) - except AttributeError: - raise PipelineError("Output '%s' not found in module '%s'" % (class_name, module_name)) - - try: - output = output_class(node, name, **config) - except Exception as e: - raise PipelineError("Could not create custom output '%s': %s" % (custom_output, e)) - - if not isinstance(output, Output): - raise PipelineError("Custom output '%s' must subclass 'podpac.core.pipeline.output.Output'" % custom_output) - - return output - - -def _get_subattr(nodes, name, ref): - refs = ref.split(".") - - try: - attr = nodes[refs[0]] - for _name in refs[1:]: - attr = getattr(attr, _name) - except (KeyError, AttributeError): - raise PipelineError("'%s' references nonexistent node/attribute '%s'" % (name, ref)) - - if settings["DEBUG"]: - attr = deepcopy(attr) - - return attr diff --git a/podpac/core/pipeline/test/test.json b/podpac/core/pipeline/test/test.json deleted file mode 100644 index 4202055f3..000000000 --- a/podpac/core/pipeline/test/test.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "nodes": { - "a": { - "node": "algorithm.Arange" - } - }, - "pipeline_output": { - "node": "a", - "mode": "file", - "format": "pickle", - "outdir": "." 
- } -} \ No newline at end of file diff --git a/podpac/core/pipeline/test/test_output.py b/podpac/core/pipeline/test/test_output.py deleted file mode 100644 index 42163ff89..000000000 --- a/podpac/core/pipeline/test/test_output.py +++ /dev/null @@ -1,66 +0,0 @@ -from __future__ import division, unicode_literals, print_function, absolute_import - -import os -import pytest - -import podpac -from podpac.core.algorithm.utility import Arange -from podpac.core.pipeline.output import FileOutput, FTPOutput, S3Output, NoOutput, ImageOutput - -coords = podpac.Coordinates([[0, 1, 2], [10, 20, 30]], dims=["lat", "lon"]) -node = Arange() -node_output = node.eval(coords) - - -class TestNoOutput(object): - def test(self): - output = NoOutput(node=node, name="test") - output.write(node_output, coords) - - -class TestFileOutput(object): - def _test(self, format): - output = FileOutput(node=node, name="test", outdir=".", format=format) - output.write(node_output, coords) - - assert output.path != None - assert os.path.isfile(output.path) - os.remove(output.path) - - def test_pickle(self): - self._test("pickle") - - def test_png(self): - # self._test('png') - - output = FileOutput(node=node, name="test", outdir=".", format="png") - with pytest.raises(NotImplementedError): - output.write(node_output, coords) - - def test_geotif(self): - # self._test('geotif') - - output = FileOutput(node=node, name="test", outdir=".", format="geotif") - with pytest.raises(NotImplementedError): - output.write(node_output, coords) - - -class TestFTPOutput(object): - def test(self): - output = FTPOutput(node=node, name="test", url="none", user="none") - with pytest.raises(NotImplementedError): - output.write(node_output, coords) - - -class TestS3Output(object): - def test(self): - output = S3Output(node=node, name="test", user="none", bucket="none") - with pytest.raises(NotImplementedError): - output.write(node_output, coords) - - -class TestImageOutput(object): - def test(self): - output = ImageOutput(node=node, name="test") - output.write(node_output, coords) - assert output.image is not None diff --git a/podpac/core/pipeline/test/test_parse_pipeline_definition.py b/podpac/core/pipeline/test/test_parse_pipeline_definition.py deleted file mode 100644 index b4cb11818..000000000 --- a/podpac/core/pipeline/test/test_parse_pipeline_definition.py +++ /dev/null @@ -1,848 +0,0 @@ -from __future__ import division, unicode_literals, print_function, absolute_import - -import json -import warnings -from collections import OrderedDict - -import numpy as np -import traitlets as tl -import pytest - -import podpac -from podpac.core.pipeline.pipeline import Pipeline, PipelineError, parse_pipeline_definition -from podpac.core.pipeline.output import NoOutput, FTPOutput, S3Output, FileOutput, ImageOutput - - -class TestParsePipelineDefinition(object): - def test_empty(self): - s = "{ }" - d = json.loads(s, object_pairs_hook=OrderedDict) - with pytest.raises(PipelineError, match="Pipeline definition requires 'nodes' property"): - parse_pipeline_definition(d) - - def test_no_nodes(self): - s = '{"nodes": { } }' - d = json.loads(s, object_pairs_hook=OrderedDict) - with pytest.raises(PipelineError, match="'nodes' property cannot be empty"): - parse_pipeline_definition(d) - - def test_invalid_node(self): - # module does not exist - s = '{"nodes": {"a": {"node": "nonexistent.Arbitrary"} } }' - d = json.loads(s, object_pairs_hook=OrderedDict) - with pytest.raises(PipelineError, match="No module found"): - parse_pipeline_definition(d) - - # node does not 
exist in module - s = '{"nodes": {"a": {"node": "core.Nonexistent"} } }' - d = json.loads(s, object_pairs_hook=OrderedDict) - with pytest.raises(PipelineError, match="Node 'Nonexistent' not found"): - parse_pipeline_definition(d) - - def test_datasource_source(self): - # basic - s = """ - { - "nodes": { - "mydata": { - "node": "data.DataSource", - "source": "my_data_string" - } - } - } - """ - - d = json.loads(s, object_pairs_hook=OrderedDict) - output = parse_pipeline_definition(d) - assert output.node.source == "my_data_string" - - # not required - s = """ - { - "nodes": { - "mydata": { - "node": "data.DataSource" - } - } - } - """ - - d = json.loads(s, object_pairs_hook=OrderedDict) - output = parse_pipeline_definition(d) - - # incorrect - s = """ - { - "nodes": { - "mydata": { - "node": "data.DataSource", - "attrs": { - "source": "my_data_string" - } - } - } - } - """ - - d = json.loads(s, object_pairs_hook=OrderedDict) - with pytest.raises(PipelineError, match="The 'source' property cannot be in attrs"): - parse_pipeline_definition(d) - - def test_datasource_lookup_source(self): - # sub-node - s = """ - { - "nodes": { - "mydata": { - "node": "data.DataSource", - "source": "my_data_string" - }, - "mydata2": { - "node": "data.DataSource", - "lookup_source": "mydata.source" - } - } - } - """ - - d = json.loads(s, object_pairs_hook=OrderedDict) - output = parse_pipeline_definition(d) - assert output.node.source == "my_data_string" - - # nonexistent node - s = """ - { - "nodes": { - "mydata": { - "node": "data.DataSource", - "source": "my_data_string" - }, - "mydata2": { - "node": "data.DataSource", - "lookup_source": "nonexistent.source" - } - } - } - """ - - d = json.loads(s, object_pairs_hook=OrderedDict) - with pytest.raises(PipelineError, match="'mydata2' references nonexistent node/attribute"): - parse_pipeline_definition(d) - - # nonexistent subattr - s = """ - { - "nodes": { - "mydata": { - "node": "data.DataSource", - "source": "my_data_string" - }, - "mydata2": { - "node": "data.DataSource", - "lookup_source": "mydata.nonexistent.source" - } - } - } - """ - - d = json.loads(s, object_pairs_hook=OrderedDict) - with pytest.raises(PipelineError, match="'mydata2' references nonexistent node/attribute"): - parse_pipeline_definition(d) - - # nonexistent subsubattr - s = """ - { - "nodes": { - "mydata": { - "node": "data.DataSource", - "source": "my_data_string" - }, - "mydata2": { - "node": "data.DataSource", - "lookup_source": "double.source.nonexistent" - } - } - } - """ - - d = json.loads(s, object_pairs_hook=OrderedDict) - with pytest.raises(PipelineError, match="'mydata2' references nonexistent node/attribute"): - parse_pipeline_definition(d) - - # in attrs (incorrect) - s = """ - { - "nodes": { - "mydata": { - "node": "data.DataSource", - "attrs": { - "lookup_source": "my_data_string" - } - } - } - } - """ - - d = json.loads(s, object_pairs_hook=OrderedDict) - with pytest.raises(PipelineError, match="The 'lookup_source' property cannot be in attrs"): - parse_pipeline_definition(d) - - def test_reprojected_source_lookup_source(self): - # source doesn't work - s = """ - { - "nodes": { - "mysource": { - "node": "data.DataSource", - "source": "my_data_string" - }, - "reprojected": { - "node": "data.ReprojectedSource", - "source": "mysource" - } - } - } - """ - - d = json.loads(s, object_pairs_hook=OrderedDict) - with pytest.raises(tl.TraitError): - parse_pipeline_definition(d) - - # lookup_source - s = """ - { - "nodes": { - "mysource": { - "node": "data.DataSource", - "source": 
"my_data_string" - }, - "reprojected": { - "node": "data.ReprojectedSource", - "lookup_source": "mysource" - } - } - } - """ - - d = json.loads(s, object_pairs_hook=OrderedDict) - output = parse_pipeline_definition(d) - assert output.node.source - assert output.node.source.source == "my_data_string" - - # lookup_source subattr - s = """ - { - "nodes": { - "mysource": { - "node": "data.DataSource", - "source": "my_data_string" - }, - "mean": { - "node": "algorithm.Mean", - "inputs": {"source": "mysource"} - }, - "reprojected": { - "node": "data.ReprojectedSource", - "lookup_source": "mean.source" - } - } - } - """ - - d = json.loads(s, object_pairs_hook=OrderedDict) - output = parse_pipeline_definition(d) - assert output.node.source - assert output.node.source.source == "my_data_string" - - # nonexistent node/attribute references are tested in test_datasource_lookup_source - - def test_array_source(self): - s = """ - { - "nodes": { - "mysource": { - "node": "data.Array", - "source": [0, 1, 2] - } - } - } - """ - - d = json.loads(s, object_pairs_hook=OrderedDict) - output = parse_pipeline_definition(d) - np.testing.assert_array_equal(output.node.source, [0, 1, 2]) - - def test_array_lookup_source(self): - # source doesn't work - s = """ - { - "nodes": { - "a": { - "node": "data.Array", - "source": [0, 1, 2] - }, - "b": { - "node": "data.Array", - "source": "a.source" - } - } - } - """ - - d = json.loads(s, object_pairs_hook=OrderedDict) - with pytest.raises(ValueError): - parse_pipeline_definition(d) - - # lookup_source does work - s = """ - { - "nodes": { - "a": { - "node": "data.Array", - "source": [0, 1, 2] - }, - "b": { - "node": "data.Array", - "lookup_source": "a.source" - } - } - } - """ - - d = json.loads(s, object_pairs_hook=OrderedDict) - output = parse_pipeline_definition(d) - np.testing.assert_array_equal(output.node.source, [0, 1, 2]) - - def test_algorithm_inputs(self): - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", "Insecure evaluation.*") - - # basic - s = """ - { - "nodes": { - "source1": {"node": "algorithm.Arange"}, - "source2": {"node": "algorithm.CoordData"}, - "result": { - "node": "algorithm.Arithmetic", - "inputs": { - "A": "source1", - "B": "source2" - }, - "attrs": { - "eqn": "A + B" - } - } - } - } - """ - - d = json.loads(s, object_pairs_hook=OrderedDict) - output = parse_pipeline_definition(d) - - assert isinstance(output.node.inputs["A"], podpac.algorithm.Arange) - assert isinstance(output.node.inputs["B"], podpac.algorithm.CoordData) - - # sub-node - s = """ - { - "nodes": { - "mysource": {"node": "algorithm.Arange"}, - "mean": { - "node": "algorithm.Mean", - "inputs": {"source": "mysource"} - }, - "double": { - "node": "algorithm.Arithmetic", - "inputs": { "A": "mean.source" }, - "attrs": { "eqn": "2 * A" } - } - } - } - """ - - d = json.loads(s, object_pairs_hook=OrderedDict) - output = parse_pipeline_definition(d) - - assert isinstance(output.node.inputs["A"], podpac.algorithm.Arange) - - # nonexistent node/attribute references are tested in test_datasource_lookup_source - - def test_compositor_sources(self): - # basic - s = """ - { - "nodes": { - "a": {"node": "algorithm.Arange"}, - "b": {"node": "algorithm.CoordData"}, - "c": { - "node": "compositor.OrderedCompositor", - "sources": ["a", "b"] - } - } - } - """ - - d = json.loads(s, object_pairs_hook=OrderedDict) - output = parse_pipeline_definition(d) - assert isinstance(output.node.sources[0], podpac.algorithm.Arange) - assert isinstance(output.node.sources[1], 
podpac.algorithm.CoordData) - - # sub-node - s = """ - { - "nodes": { - "source1": {"node": "algorithm.Arange"}, - "mean1": { - "node": "algorithm.Mean", - "inputs": {"source": "source1"} - }, - "c": { - "node": "compositor.OrderedCompositor", - "sources": ["mean1.source", "source1"] - } - } - } - """ - - d = json.loads(s, object_pairs_hook=OrderedDict) - output = parse_pipeline_definition(d) - assert isinstance(output.node, podpac.compositor.OrderedCompositor) - assert isinstance(output.node.sources[0], podpac.algorithm.Arange) - assert isinstance(output.node.sources[1], podpac.algorithm.Arange) - - # nonexistent node/attribute references are tested in test_datasource_lookup_source - - def test_datasource_interpolation(self): - s = """ - { - "nodes": { - "mydata": { - "node": "data.DataSource", - "source": "my_data_string", - "interpolation": "nearest" - } - } - } - """ - - d = json.loads(s, object_pairs_hook=OrderedDict) - output = parse_pipeline_definition(d) - assert output.node.interpolation == "nearest" - - # not required - s = """ - { - "nodes": { - "mydata": { - "node": "data.DataSource" - } - } - } - """ - - d = json.loads(s, object_pairs_hook=OrderedDict) - output = parse_pipeline_definition(d) - - # incorrect - s = """ - { - "nodes": { - "mydata": { - "node": "data.DataSource", - "attrs": { - "interpolation": "nearest" - } - } - } - } - """ - - d = json.loads(s, object_pairs_hook=OrderedDict) - with pytest.raises(PipelineError, match="The 'interpolation' property cannot be in attrs"): - parse_pipeline_definition(d) - - def test_compositor_interpolation(self): - s = """ - { - "nodes": { - "a": { - "node": "algorithm.Arange" - }, - "b": { - "node": "algorithm.Arange" - }, - "c": { - "node": "compositor.OrderedCompositor", - "sources": ["a", "b"], - "interpolation": "nearest" - } - } - } - """ - - d = json.loads(s, object_pairs_hook=OrderedDict) - output = parse_pipeline_definition(d) - assert output.node.interpolation == "nearest" - - def test_attrs(self): - import podpac.datalib.smap - - s = """ - { - "nodes": { - "sm": { - "node": "datalib.smap.SMAP", - "attrs": { - "product": "SPL4SMGP" - } - } - } - } - """ - - d = json.loads(s, object_pairs_hook=OrderedDict) - output = parse_pipeline_definition(d) - assert output.node.product == "SPL4SMGP" - - def test_lookup_attrs(self): - # attrs doesn't work - s = """ - { - "nodes": { - "a": { - "node": "algorithm.CoordData", - "attrs": { "coord_name": "lat" } - }, - "b": { - "node": "algorithm.CoordData", - "attrs": { "coord_name": "a.coord_name" } - } - } - } - """ - - d = json.loads(s, object_pairs_hook=OrderedDict) - output = parse_pipeline_definition(d) - with pytest.raises(AssertionError): - assert output.node.coord_name == "lat" - - # but lookup_attrs does - s = """ - { - "nodes": { - "a": { - "node": "algorithm.CoordData", - "attrs": { "coord_name": "lat" } - }, - "b": { - "node": "algorithm.CoordData", - "lookup_attrs": { "coord_name": "a.coord_name" } - } - } - } - """ - - d = json.loads(s, object_pairs_hook=OrderedDict) - output = parse_pipeline_definition(d) - assert output.node.coord_name == "lat" - - # NOTE: no nodes currently have a Node as an attr - # # lookup node directly (instead of a sub-attr) - # s = ''' - # { - # "nodes": { - # "mysource": { - # "node": "data.DataSource" - # }, - # "mynode": { - # "node": "MyNode", - # "lookup_attrs": { - # "my_node_attr": "mysource" - # } - # } - # } - # } - # ''' - - # d = json.loads(s, object_pairs_hook=OrderedDict) - # output = parse_pipeline_definition(d) - # assert 
isinstance(output.node.my_node_attr, DataSource) - - # nonexistent node/attribute references are tested in test_datasource_lookup_source - - def test_invalid_property(self): - s = """ - { - "nodes": { - "a": { - "node": "algorithm.Arange", - "invalid_property": "value" - } - } - } - """ - - d = json.loads(s, object_pairs_hook=OrderedDict) - with pytest.raises(PipelineError, match="node 'a' has unexpected property"): - parse_pipeline_definition(d) - - def test_plugin(self): - pass - - def test_parse_output_none(self): - s = """ - { - "nodes": {"a": {"node": "algorithm.Arange"} }, - "pipeline_output": {"node": "a", "mode": "none"} - } - """ - d = json.loads(s, object_pairs_hook=OrderedDict) - output = parse_pipeline_definition(d) - assert isinstance(output, NoOutput) - assert isinstance(output.node, podpac.algorithm.Arange) - assert output.name == "a" - - def test_parse_output_file(self): - s = """ - { - "nodes": {"a": {"node": "algorithm.Arange"} }, - "pipeline_output": { - "node": "a", - "mode": "file", - "format": "pickle", - "outdir": "my_directory" - } - } - """ - d = json.loads(s, object_pairs_hook=OrderedDict) - output = parse_pipeline_definition(d) - assert isinstance(output, FileOutput) - assert isinstance(output.node, podpac.algorithm.Arange) - assert output.name == "a" - assert output.format == "pickle" - assert output.outdir == "my_directory" - - def test_parse_output_s3(self): - s = """ - { - "nodes": {"a": {"node": "algorithm.Arange"} }, - "pipeline_output": { - "node": "a", - "mode": "s3", - "user": "my_user", - "bucket": "my_bucket" - } - } - """ - d = json.loads(s, object_pairs_hook=OrderedDict) - output = parse_pipeline_definition(d) - assert isinstance(output, S3Output) - assert isinstance(output.node, podpac.algorithm.Arange) - assert output.name == "a" - assert output.user == "my_user" - assert output.bucket == "my_bucket" - - def test_parse_output_ftp(self): - s = """ - { - "nodes": {"a": {"node": "algorithm.Arange"} }, - "pipeline_output": { - "node": "a", - "mode": "ftp", - "url": "my_url", - "user": "my_user" - } - } - """ - d = json.loads(s, object_pairs_hook=OrderedDict) - output = parse_pipeline_definition(d) - assert isinstance(output, FTPOutput) - assert isinstance(output.node, podpac.algorithm.Arange) - assert output.name == "a" - assert output.user == "my_user" - assert output.url == "my_url" - # TODO password - - def test_parse_output_image(self): - s = """ - { - "nodes": {"a": {"node": "algorithm.Arange"} }, - "pipeline_output": { - "node": "a", - "mode": "image" - } - } - """ - d = json.loads(s, object_pairs_hook=OrderedDict) - output = parse_pipeline_definition(d) - assert isinstance(output, ImageOutput) - assert isinstance(output.node, podpac.algorithm.Arange) - assert output.name == "a" - - def test_parse_output_invalid_mode(self): - # invalid mode - s = """ - { - "nodes": {"a": {"node": "algorithm.Arange"} }, - "pipeline_output": {"mode": "nonexistent_mode"} - } - """ - d = json.loads(s, object_pairs_hook=OrderedDict) - with pytest.raises(PipelineError, match="output has unexpected mode"): - parse_pipeline_definition(d) - - def test_parse_output_implicit_mode(self): - s = """ - { - "nodes": {"a": {"node": "algorithm.Arange"} }, - "pipeline_output": {"node": "a"} - } - """ - d = json.loads(s, object_pairs_hook=OrderedDict) - output = parse_pipeline_definition(d) - assert isinstance(output, NoOutput) - assert isinstance(output.node, podpac.algorithm.Arange) - assert output.name == "a" - - def test_parse_output_nonexistent_node(self): - s = """ - { - 
"nodes": {"a": {"node": "algorithm.Arange"} }, - "pipeline_output": { - "node": "b", - "mode": "file", - "format": "pickle", - "outdir": "my_directory" - } - } - """ - d = json.loads(s, object_pairs_hook=OrderedDict) - with pytest.raises(PipelineError, match="output' references nonexistent node"): - parse_pipeline_definition(d) - - def test_parse_output_implicit_node(self): - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", "Insecure evaluation.*") - - s = """ - { - "nodes": { - "source1": {"node": "algorithm.Arange"}, - "source2": {"node": "algorithm.Arange"}, - "result": { - "node": "algorithm.Arithmetic", - "inputs": { - "A": "source1", - "B": "source2" - }, - "attrs": { - "eqn": "A + B" - } - } - }, - "pipeline_output": { - "mode": "none" - } - } - """ - d = json.loads(s, object_pairs_hook=OrderedDict) - output = parse_pipeline_definition(d) - assert isinstance(output.node, podpac.algorithm.Arithmetic) - - def test_parse_output_implicit(self): - s = """ - { - "nodes": {"a": {"node": "algorithm.Arange"} } - } - """ - d = json.loads(s, object_pairs_hook=OrderedDict) - output = parse_pipeline_definition(d) - assert isinstance(output, NoOutput) - assert isinstance(output.node, podpac.algorithm.Arange) - assert output.name == "a" - - def test_parse_custom_output(self): - s = """ { - "nodes": {"a": {"node": "algorithm.Arange"} }, - "pipeline_output": { - "plugin": "podpac.core.pipeline.output", - "output": "ImageOutput" - } - } - """ - - d = json.loads(s, object_pairs_hook=OrderedDict) - output = parse_pipeline_definition(d) - assert isinstance(output, ImageOutput) - - s = """ { - "nodes": {"a": {"node": "algorithm.Arange"} }, - "pipeline_output": { - "plugin": "podpac", - "output": "core.pipeline.output.ImageOutput" - } - } - """ - - d = json.loads(s, object_pairs_hook=OrderedDict) - output = parse_pipeline_definition(d) - assert isinstance(output, ImageOutput) - - def test_parse_custom_output_invalid(self): - # no module - s = """ { - "nodes": {"a": {"node": "algorithm.Arange"} }, - "pipeline_output": { - "plugin": "nonexistent_module", - "output": "arbitrary" - } - } - """ - - d = json.loads(s, object_pairs_hook=OrderedDict) - with pytest.raises(PipelineError, match="No module found"): - parse_pipeline_definition(d) - - # module okay, but no such class - s = """ { - "nodes": {"a": {"node": "algorithm.Arange"} }, - "pipeline_output": { - "plugin": "podpac.core.pipeline.output", - "output": "Nonexistent" - } - } - """ - - d = json.loads(s, object_pairs_hook=OrderedDict) - with pytest.raises(PipelineError, match="Output 'Nonexistent' not found"): - parse_pipeline_definition(d) - - # module okay, class found, could not create - s = """ { - "nodes": {"a": {"node": "algorithm.Arange"} }, - "pipeline_output": { - "plugin": "numpy", - "output": "ndarray" - } - } - """ - - d = json.loads(s, object_pairs_hook=OrderedDict) - with pytest.raises(PipelineError, match="Could not create custom output"): - parse_pipeline_definition(d) - - # module okay, class found, incorrect type - s = """ { - "nodes": {"a": {"node": "algorithm.Arange"} }, - "pipeline_output": { - "plugin": "numpy", - "output": "array" - } - } - """ - - d = json.loads(s, object_pairs_hook=OrderedDict) - m = "Custom output '.*' must subclass 'podpac.core.pipeline.output.Output'" - with pytest.raises(PipelineError, match=m): - parse_pipeline_definition(d) diff --git a/podpac/core/pipeline/test/test_pipeline.py b/podpac/core/pipeline/test/test_pipeline.py deleted file mode 100644 index 195d35e85..000000000 --- 
a/podpac/core/pipeline/test/test_pipeline.py +++ /dev/null @@ -1,174 +0,0 @@ -from __future__ import division, unicode_literals, print_function, absolute_import - -import os -import json -from collections import OrderedDict -import warnings - -import numpy as np -import pytest - -import podpac -from podpac.core.algorithm.utility import Arange -from podpac.core.pipeline.pipeline import Pipeline, PipelineError -from podpac.core.pipeline.output import FileOutput - -coords = podpac.Coordinates([[0, 1, 2], [10, 20, 30]], dims=["lat", "lon"]) -node = Arange() -node.eval(coords) - - -class TestPipeline(object): - def test_init_path(self): - path = os.path.join(os.path.abspath(podpac.__path__[0]), "core", "pipeline", "test", "test.json") - with pytest.warns(DeprecationWarning): - pipeline = Pipeline(path=path) - - assert pipeline.json - assert pipeline.definition - assert pipeline.pipeline_output - - def test_init_json(self): - s = """ - { - "nodes": { - "a": { - "node": "algorithm.Arange" - } - } - } - """ - - with pytest.warns(DeprecationWarning): - pipeline = Pipeline(json=s) - assert pipeline.json - assert pipeline.definition - assert pipeline.pipeline_output - - def test_init_definition(self): - s = """ - { - "nodes": { - "a": { - "node": "algorithm.Arange" - } - } - } - """ - - d = json.loads(s, object_pairs_hook=OrderedDict) - - with pytest.warns(DeprecationWarning): - pipeline = Pipeline(definition=d) - assert pipeline.json - assert pipeline.definition - assert pipeline.pipeline_output - - def test_init_error(self): - pass - - def test_eval(self): - s = """ - { - "nodes": { - "a": { - "node": "algorithm.Arange" - } - } - } - """ - - with pytest.warns(DeprecationWarning): - pipeline = Pipeline(json=s) - pipeline.eval(coords) - - pipeline.units - pipeline.dtype - pipeline.cache_ctrl - pipeline.style - - def test_eval_output(self): - path = os.path.join(os.path.abspath(podpac.__path__[0]), "core", "pipeline", "test") - - s = """ - { - "nodes": { - "a": { - "node": "algorithm.Arange" - } - }, - "pipeline_output": { - "node": "a", - "mode": "file", - "format": "pickle", - "outdir": "." - } - } - """ - - with pytest.warns(DeprecationWarning): - pipeline = Pipeline(json=s) - pipeline.eval(coords) - assert pipeline.pipeline_output.path is not None - assert os.path.isfile(pipeline.pipeline_output.path) - os.remove(pipeline.pipeline_output.path) - - def test_eval_no_output(self): - path = os.path.join(os.path.abspath(podpac.__path__[0]), "core", "pipeline", "test") - - s = """ - { - "nodes": { - "a": { - "node": "algorithm.Arange" - } - }, - "pipeline_output": { - "node": "a", - "mode": "file", - "format": "pickle", - "outdir": "." 
- } - } - """ - - with pytest.warns(DeprecationWarning): - pipeline = Pipeline(json=s, do_write_output=False) - pipeline.eval(coords) - if pipeline.pipeline_output.path is not None and os.path.isfile(pipeline.pipeline_output.path): - os.remove(pipeline.pipeline_output.path) - assert pipeline.pipeline_output.path is None - - def test_debuggable(self): - s = """ - { - "nodes": { - "a": { - "node": "algorithm.Arange" - }, - "mean": { - "node": "algorithm.SpatialConvolution", - "inputs": {"source": "a"}, - "attrs": {"kernel_type": "mean,3"} - }, - "c": { - "node": "algorithm.Arithmetic", - "inputs": {"A": "a", "B": "mean"}, - "attrs": {"eqn": "a-b"} - } - } - } - """ - - with podpac.settings, warnings.catch_warnings(): - warnings.filterwarnings("ignore", "Insecure evaluation.*") - - podpac.core.settings.settings["DEBUG"] = False - with pytest.warns(DeprecationWarning): - pipeline = Pipeline(json=s) - assert pipeline.node.inputs["A"] is pipeline.node.inputs["B"].source - - podpac.core.settings.settings["DEBUG"] = True - with pytest.warns(DeprecationWarning): - pipeline = Pipeline(json=s) - assert pipeline.node.inputs["A"] is not pipeline.node.inputs["B"].source diff --git a/podpac/core/pipeline/test/test_pipeline_imports.py b/podpac/core/pipeline/test/test_pipeline_imports.py deleted file mode 100644 index d6534d6eb..000000000 --- a/podpac/core/pipeline/test/test_pipeline_imports.py +++ /dev/null @@ -1,3 +0,0 @@ -def test_import(): - from podpac.core.pipeline import Pipeline - from podpac.core.pipeline import PipelineError diff --git a/podpac/core/settings.py b/podpac/core/settings.py index ed378ce13..8fbd44353 100644 --- a/podpac/core/settings.py +++ b/podpac/core/settings.py @@ -20,10 +20,12 @@ DEFAULT_SETTINGS = { # podpac core settings "DEBUG": False, # This flag currently sets self._output on nodes - "ROOT_PATH": os.path.join(os.path.expanduser("~"), ".podpac"), + "ROOT_PATH": os.path.join(os.environ.get("XDG_CONFIG_HOME", os.path.expanduser("~")), ".config", "podpac"), "AUTOSAVE_SETTINGS": False, "LOG_TO_FILE": False, - "LOG_FILE_PATH": os.path.join(os.path.expanduser("~"), ".podpac", "logs", "podpac.log"), + "LOG_FILE_PATH": os.path.join( + os.environ.get("XDG_CONFIG_HOME", os.path.expanduser("~")), "podpac", "logs", "podpac.log" + ), "MULTITHREADING": False, "N_THREADS": 8, "CHUNK_SIZE": None, # Size of chunks for parallel processing or large arrays that do not fit in memory @@ -33,7 +35,8 @@ "UNSAFE_EVAL_HASH": uuid.uuid4().hex, # unique id for running unsafe evaluations # cache "DEFAULT_CACHE": ["ram"], - "CACHE_OUTPUT_DEFAULT": True, + "CACHE_DATASOURCE_OUTPUT_DEFAULT": True, + "CACHE_NODE_OUTPUT_DEFAULT": False, "RAM_CACHE_MAX_BYTES": 1e9, # ~1GB "DISK_CACHE_MAX_BYTES": 10e9, # ~10GB "S3_CACHE_MAX_BYTES": 10e9, # ~10GB @@ -65,17 +68,18 @@ class PodpacSettings(dict): Podpac settings are persistently stored in a ``settings.json`` file created at runtime. By default, podpac will create a settings json file in the users - home directory (``~/.podpac/settings.json``) when first run. + home directory (``~/.config/podpac/settings.json``), or $XDG_CONFIG_HOME/podpac/settings.json when first run. Default settings can be overridden or extended by: - * editing the ``settings.json`` file in the home directory (i.e. ``~/.podpac/settings.json``) + * editing the ``settings.json`` file in the settings directory (i.e. ``~/.podpac/settings.json`` or + ``$XDG_CONFIG_HOME/podpac/settings.json``) * creating a ``settings.json`` in the current working directory (i.e. 
``./settings.json``)

     If ``settings.json`` files exist in multiple places, podpac will load settings in the following order,
-    overwriting previously loaded settings in the process:
-    * podpac settings defaults
-    * home directory settings (``~/.podpac/settings.json``)
-    * current working directory settings (``./settings.json``)
+    overwriting previously loaded settings in the process (i.e. the highest numbered settings file is preferred):
+    1. podpac settings defaults
+    2. settings directory (``~/.podpac/settings.json`` or ``$XDG_CONFIG_HOME/podpac/settings.json``)
+    3. current working directory settings (``./settings.json``)
 
     :attr:`settings.settings_path` shows the path of the last loaded settings file (e.g. the active settings file).
     To persistently update the active settings file as changes are made at runtime,
@@ -106,8 +110,10 @@ class PodpacSettings(dict):
         Notification email for when AWS usage reaches 80% of the `AWS_BUDGET_AMOUNT`
     DEFAULT_CACHE : list
         Defines a default list of cache stores in priority order. Defaults to `['ram']`.
-    CACHE_OUTPUT_DEFAULT : bool
+    CACHE_NODE_OUTPUT_DEFAULT : bool
         Default value for node ``cache_output`` trait. If True, the outputs of nodes (eval) will be automatically cached.
+    CACHE_DATASOURCE_OUTPUT_DEFAULT : bool
+        Default value for DataSource nodes' ``cache_output`` trait. If True, the outputs of DataSource nodes (eval) will be automatically cached.
     RAM_CACHE_MAX_BYTES : int
         Maximum RAM cache size in bytes.
         Note, for RAM cache only, the limit is applied to the total amount of RAM used by the python process;
@@ -124,7 +130,9 @@ class PodpacSettings(dict):
         Defaults to ``10e9`` (~10G). Set to `None` explicitly for no limit.
     DISK_CACHE_DIR : str
-        Subdirectory to use for the disk cache. Defaults to ``'cache'`` in the podpac root directory.
+        Subdirectory to use for the disk cache. Defaults to ``'cache'`` in the podpac root directory.
+        Use settings.cache_path to access this setting (this property checks the environment variable
+        `XDG_CACHE_HOME` to adjust the location of the cache directory).
     S3_CACHE_DIR : str
         Subdirectory to use for S3 cache (within the specified S3 bucket). Defaults to ``'cache'``.
RAM_CACHE_ENABLED: bool @@ -263,6 +271,22 @@ def settings_path(self): """ return self._settings_filepath + @property + def cache_path(self): + """Path to the cache + + Returns + ------- + str + Path to where the cache is stored + """ + if os.path.isabs(settings["DISK_CACHE_DIR"]): + path = settings["DISK_CACHE_DIR"] + else: + path = os.path.join(os.environ.get("XDG_CACHE_HOME", settings["ROOT_PATH"]), settings["DISK_CACHE_DIR"]) + + return path + @property def defaults(self): """ diff --git a/podpac/core/test/test_authentication.py b/podpac/core/test/test_authentication.py index fe96f7bb4..1f8fd1902 100644 --- a/podpac/core/test/test_authentication.py +++ b/podpac/core/test/test_authentication.py @@ -1,39 +1,149 @@ -from __future__ import division, unicode_literals, print_function, absolute_import - import pytest -import sys -from io import StringIO +import requests +import traitlets as tl +import s3fs -import podpac.core.authentication as auth +from podpac import settings, Node +from podpac.core.authentication import RequestsSessionMixin, S3Mixin, set_credentials class TestAuthentication(object): - def test_earth_data_session_update(self): - eds = auth.EarthDataSession() - eds.update_login("testuser", "testpassword") - eds = auth.EarthDataSession() - assert eds.auth == ("testuser", "testpassword") - - def test_earth_data_session_update_input(self): - eds = auth.EarthDataSession() - auth.input = lambda x: "testuser2" - auth.getpass.getpass = lambda: "testpass2" - eds.update_login() - eds = auth.EarthDataSession() - assert eds.auth == ("testuser2", "testpass2") - - def test_earth_data_session_rebuild_auth(self): - eds = auth.EarthDataSession() - - class Dum(object): - pass - - prepared_request = Dum() - prepared_request.headers = {"Authorization": 0} - prepared_request.url = "https://example.com" - - response = Dum() - response.request = Dum() - response.request.url = "https://example2.com" - - eds.rebuild_auth(prepared_request, response) + def test_set_credentials(self): + + with settings: + if "username@test.com" in settings: + del settings["username@test.com"] + + if "password@test.com" in settings: + del settings["password@test.com"] + + # require hostname + with pytest.raises(TypeError): + set_credentials() + + with pytest.raises(ValueError): + set_credentials(None, username="test", password="test") + + with pytest.raises(ValueError): + set_credentials("", username="test", password="test") + + # make sure these are empty at first + assert not settings["username@test.com"] + assert not settings["password@test.com"] + + # test input/getpass + # TODO: how do you test this? 
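For reference, a minimal usage sketch of the new XDG-aware cache path and per-hostname credential storage exercised by the settings.py and authentication changes above. It assumes only the APIs shown in this patch (settings.cache_path, set_credentials); the hostname and paths below are placeholders, not values from the diff.

import os

from podpac import settings
from podpac.core.authentication import set_credentials

# settings.cache_path resolves the disk cache directory lazily: an absolute
# DISK_CACHE_DIR is used as-is, otherwise it is joined onto $XDG_CACHE_HOME
# (if set) or settings["ROOT_PATH"]
os.environ["XDG_CACHE_HOME"] = "/tmp/podpac-cache"  # placeholder path
settings["DISK_CACHE_DIR"] = "cache"
print(settings.cache_path)  # -> /tmp/podpac-cache/cache

# set_credentials stores credentials per hostname in the settings dict,
# under "username@<hostname>" and "password@<hostname>" keys
set_credentials("example.com", username="user", password="pass")  # placeholder hostname
assert settings["username@example.com"] == "user"
assert settings["password@example.com"] == "pass"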
+ + # set both username and password + set_credentials(hostname="test.com", username="testuser", password="testpass") + assert settings["username@test.com"] == "testuser" + assert settings["password@test.com"] == "testpass" + + # set username only + set_credentials(hostname="test.com", username="testuser2") + assert settings["username@test.com"] == "testuser2" + assert settings["password@test.com"] == "testpass" + + # set password only + set_credentials(hostname="test.com", password="testpass3") + assert settings["username@test.com"] == "testuser2" + assert settings["password@test.com"] == "testpass3" + + # don't do anything if neither is provided, but the settings exist + set_credentials(hostname="test.com") + assert settings["username@test.com"] == "testuser2" + assert settings["password@test.com"] == "testpass3" + + +# dummy class mixing in RequestsSession with hostname +class SomeNodeWithHostname(RequestsSessionMixin): + hostname = "myurl.org" + + +class SomeNode(RequestsSessionMixin): + pass + + +class TestRequestsSessionMixin(object): + def test_hostname(self): + node = SomeNode(hostname="someurl.org") + assert node.hostname == "someurl.org" + + # use class that implements + node = SomeNodeWithHostname() + assert node.hostname == "myurl.org" + + def test_property_value_errors(self): + node = SomeNode(hostname="propertyerrors.com") + + with pytest.raises(ValueError, match="set_credentials"): + u = node.username + + with pytest.raises(ValueError, match="set_credentials"): + p = node.password + + def test_set_credentials(self): + with settings: + node = SomeNode(hostname="setcredentials.com") + node.set_credentials(username="testuser", password="testpass") + assert settings["username@setcredentials.com"] == "testuser" + assert settings["password@setcredentials.com"] == "testpass" + + def test_property_values(self): + with settings: + node = SomeNode(hostname="propertyvalues.com") + node.set_credentials(username="testuser2", password="testpass2") + + assert node.username == "testuser2" + assert node.password == "testpass2" + + def test_session(self): + with settings: + node = SomeNode(hostname="session.net") + node.set_credentials(username="testuser", password="testpass") + + assert node.session + assert node.session.auth == ("testuser", "testpass") + assert isinstance(node.session, requests.Session) + + def test_auth_required(self): + with settings: + with pytest.raises(tl.TraitError): + node = SomeNode(hostname="auth.com", auth_required="true") + + # no auth + node = SomeNode(hostname="auth.com") + assert node.session + assert isinstance(node.session, requests.Session) + with pytest.raises(AttributeError): + node.auth + + # auth required + if "username@auth2.com" in settings: + del settings["username@auth2.com"] + + if "password@auth2.com" in settings: + del settings["password@auth2.com"] + + node = SomeNode(hostname="auth2.com", auth_required=True) + with pytest.raises(ValueError): + s = node.session + print(s) + + node.set_credentials(username="testuser", password="testpass") + assert node.session + assert isinstance(node.session, requests.Session) + + +class TestS3Mixin(object): + class S3Node(S3Mixin, Node): + pass + + def test_anon(self): + node = self.S3Node(anon=True) + assert isinstance(node.s3, s3fs.S3FileSystem) + + @pytest.mark.aws + def test_auth(self): + node = self.S3Node() + assert isinstance(node.s3, s3fs.S3FileSystem) diff --git a/podpac/core/test/test_compositor.py b/podpac/core/test/test_compositor.py deleted file mode 100644 index 453a96d51..000000000 --- 
a/podpac/core/test/test_compositor.py +++ /dev/null @@ -1,400 +0,0 @@ -import warnings - -import pytest -import numpy as np - -import podpac -from podpac.core.data.datasource import DataSource -from podpac.core.data.array_source import Array -from podpac.compositor import Compositor, OrderedCompositor - -COORDS = podpac.Coordinates( - [podpac.clinspace(45, 0, 16), podpac.clinspace(-70, -65, 16), podpac.clinspace(0, 1, 2)], - dims=["lat", "lon", "time"], -) -LON, LAT, TIME = np.meshgrid(COORDS["lon"].coordinates, COORDS["lat"].coordinates, COORDS["time"].coordinates) - -ARRAY_LAT = Array(source=LAT.astype(float), native_coordinates=COORDS, interpolation="bilinear") -ARRAY_LON = Array(source=LON.astype(float), native_coordinates=COORDS, interpolation="bilinear") -ARRAY_TIME = Array(source=TIME.astype(float), native_coordinates=COORDS, interpolation="bilinear") - -MULTI_0_XY = Array(source=np.full(COORDS.shape + (2,), 0), native_coordinates=COORDS, outputs=["x", "y"]) -MULTI_1_XY = Array(source=np.full(COORDS.shape + (2,), 1), native_coordinates=COORDS, outputs=["x", "y"]) -MULTI_4_YX = Array(source=np.full(COORDS.shape + (2,), 4), native_coordinates=COORDS, outputs=["y", "x"]) -MULTI_2_X = Array(source=np.full(COORDS.shape + (1,), 2), native_coordinates=COORDS, outputs=["x"]) -MULTI_3_Z = Array(source=np.full(COORDS.shape + (1,), 3), native_coordinates=COORDS, outputs=["z"]) - - -class TestCompositor(object): - def test_init(self): - node = Compositor(sources=[ARRAY_LAT, ARRAY_LON, ARRAY_TIME]) - repr(node) - - def test_shared_coordinates(self): - node = Compositor(sources=[ARRAY_LAT, ARRAY_LON, ARRAY_TIME]) - - with pytest.raises(NotImplementedError): - node.get_shared_coordinates() - - with pytest.raises(NotImplementedError): - node.shared_coordinates() - - def test_source_coordinates(self): - # none (default) - node = Compositor(sources=[ARRAY_LAT, ARRAY_LON, ARRAY_TIME]) - assert node.source_coordinates is None - assert node.get_source_coordinates() is None - - # unstacked - node = podpac.compositor.Compositor( - sources=[podpac.algorithm.Arange(), podpac.algorithm.SinCoords()], - source_coordinates=podpac.Coordinates([[0, 1]], dims=["time"]), - ) - - # stacked - node = podpac.compositor.Compositor( - sources=[podpac.algorithm.Arange(), podpac.algorithm.SinCoords()], - source_coordinates=podpac.Coordinates([[[0, 1], [10, 20]]], dims=["time_alt"]), - ) - - # invalid size - with pytest.raises(ValueError, match="Invalid source_coordinates, source and source_coordinates size mismatch"): - node = podpac.compositor.Compositor( - sources=[podpac.algorithm.Arange(), podpac.algorithm.SinCoords()], - source_coordinates=podpac.Coordinates([[0, 1, 2]], dims=["time"]), - ) - - with pytest.raises(ValueError, match="Invalid source_coordinates, source and source_coordinates size mismatch"): - node = podpac.compositor.Compositor( - sources=[podpac.algorithm.Arange(), podpac.algorithm.SinCoords()], - source_coordinates=podpac.Coordinates([[0, 1, 2]], dims=["time"]), - ) - - # invalid ndims - with pytest.raises(ValueError, match="Invalid source_coordinates"): - node = podpac.compositor.Compositor( - sources=[podpac.algorithm.Arange(), podpac.algorithm.SinCoords()], - source_coordinates=podpac.Coordinates([[0, 1], [10, 20]], dims=["time", "alt"]), - ) - - def test_select_sources(self): - source_coords = podpac.Coordinates([[0, 10]], ["time"]) - node = podpac.compositor.Compositor( - sources=[podpac.algorithm.Arange(), podpac.algorithm.SinCoords()], source_coordinates=source_coords - ) - - selected = 
node.select_sources(source_coords) - assert len(selected) == 2 - assert selected[0] is node.sources[0] - assert selected[1] is node.sources[1] - - coords = podpac.Coordinates([podpac.clinspace(0, 1, 10), podpac.clinspace(0, 1, 11), 0], ["lat", "lon", "time"]) - selected = node.select_sources(coords) - assert len(selected) == 1 - assert selected[0] is node.sources[0] - - coords = podpac.Coordinates( - [podpac.clinspace(0, 1, 10), podpac.clinspace(0, 1, 11), 10], ["lat", "lon", "time"] - ) - selected = node.select_sources(coords) - assert len(selected) == 1 - assert selected[0] is node.sources[1] - - coords = podpac.Coordinates( - [podpac.clinspace(0, 1, 10), podpac.clinspace(0, 1, 11), 100], ["lat", "lon", "time"] - ) - selected = node.select_sources(coords) - assert len(selected) == 0 - - def test_iteroutputs_interpolation(self): - node = Compositor(sources=[ARRAY_LAT, ARRAY_LON, ARRAY_TIME], interpolation="nearest") - outputs = node.iteroutputs(COORDS) - for output in outputs: - pass - assert node.sources[0].interpolation == "nearest" - assert node.sources[1].interpolation == "nearest" - assert node.sources[2].interpolation == "nearest" - assert ARRAY_LAT.interpolation == "bilinear" - assert ARRAY_LON.interpolation == "bilinear" - assert ARRAY_TIME.interpolation == "bilinear" - - # if no interpolation is provided, keep the source interpolation values - node = Compositor(sources=[ARRAY_LAT, ARRAY_LON, ARRAY_TIME]) - outputs = node.iteroutputs(COORDS) - for output in outputs: - pass - assert node.sources[0].interpolation == "bilinear" - assert node.sources[1].interpolation == "bilinear" - assert node.sources[2].interpolation == "bilinear" - - def test_iteroutputs_empty(self): - node = Compositor(sources=[ARRAY_LAT, ARRAY_LON, ARRAY_TIME]) - outputs = node.iteroutputs(podpac.Coordinates([-1, -1, -1], dims=["lat", "lon", "time"])) - np.testing.assert_array_equal(next(outputs), [[[np.nan]]]) - np.testing.assert_array_equal(next(outputs), [[[np.nan]]]) - np.testing.assert_array_equal(next(outputs), [[[np.nan]]]) - with pytest.raises(StopIteration): - next(outputs) - - def test_iteroutputs_singlethreaded(self): - with podpac.settings: - podpac.settings["MULTITHREADING"] = False - - node = Compositor(sources=[ARRAY_LAT, ARRAY_LON, ARRAY_TIME]) - outputs = node.iteroutputs(COORDS) - np.testing.assert_array_equal(next(outputs), LAT) - np.testing.assert_array_equal(next(outputs), LON) - np.testing.assert_array_equal(next(outputs), TIME) - with pytest.raises(StopIteration): - next(outputs) - assert node._multi_threaded == False - - def test_iteroutputs_multithreaded(self): - with podpac.settings: - podpac.settings["MULTITHREADING"] = True - podpac.settings["N_THREADS"] = 8 - - n_threads_before = podpac.core.managers.multi_threading.thread_manager._n_threads_used - node = Compositor(sources=[ARRAY_LAT, ARRAY_LON, ARRAY_TIME]) - outputs = node.iteroutputs(COORDS) - np.testing.assert_array_equal(next(outputs), LAT) - np.testing.assert_array_equal(next(outputs), LON) - np.testing.assert_array_equal(next(outputs), TIME) - with pytest.raises(StopIteration): - next(outputs) - assert node._multi_threaded == True - assert podpac.core.managers.multi_threading.thread_manager._n_threads_used == n_threads_before - - def test_iteroutputs_n_threads_1(self): - with podpac.settings: - podpac.settings["MULTITHREADING"] = True - podpac.settings["N_THREADS"] = 1 - - n_threads_before = podpac.core.managers.multi_threading.thread_manager._n_threads_used - node = Compositor(sources=[ARRAY_LAT, ARRAY_LON, ARRAY_TIME]) - 
outputs = node.iteroutputs(COORDS) - np.testing.assert_array_equal(next(outputs), LAT) - np.testing.assert_array_equal(next(outputs), LON) - np.testing.assert_array_equal(next(outputs), TIME) - with pytest.raises(StopIteration): - next(outputs) - assert node._multi_threaded == False - assert podpac.core.managers.multi_threading.thread_manager._n_threads_used == n_threads_before - - def test_composite(self): - node = Compositor(sources=[ARRAY_LAT, ARRAY_LON, ARRAY_TIME]) - with pytest.raises(NotImplementedError): - node.composite(COORDS, iter(())) - - def test_eval(self): - node = Compositor(sources=[ARRAY_LAT, ARRAY_LON, ARRAY_TIME]) - with pytest.raises(NotImplementedError): - node.eval(COORDS) - - class MockComposite(Compositor): - def composite(self, coordinates, outputs, result=None): - return next(outputs) - - node = MockComposite(sources=[ARRAY_LAT, ARRAY_LON, ARRAY_TIME]) - output = node.eval(COORDS) - np.testing.assert_array_equal(output, LAT) - - def test_find_coordinates(self): - node = Compositor(sources=[ARRAY_LAT, ARRAY_LON, ARRAY_TIME]) - - with pytest.raises(NotImplementedError): - node.find_coordinates() - - def test_base_definition(self): - node = Compositor(sources=[ARRAY_LAT, ARRAY_LON, ARRAY_TIME]) - d = node.base_definition - assert isinstance(d, dict) - assert "sources" in d - assert "interpolation" in d - - def test_outputs(self): - # standard single-output - node = Compositor(sources=[ARRAY_LAT, ARRAY_LON, ARRAY_TIME]) - assert node.outputs is None - - # multi-output - node = Compositor(sources=[MULTI_0_XY, MULTI_1_XY]) - assert node.outputs == ["x", "y"] - - node = Compositor(sources=[MULTI_0_XY, MULTI_3_Z]) - assert node.outputs == ["x", "y", "z"] - - node = Compositor(sources=[MULTI_3_Z, MULTI_0_XY]) - assert node.outputs == ["z", "x", "y"] - - node = Compositor(sources=[MULTI_0_XY, MULTI_4_YX]) - assert node.outputs == ["x", "y"] - - # multi-output, with strict source outputs checking - node = Compositor(sources=[MULTI_0_XY, MULTI_1_XY], strict_source_outputs=True) - assert node.outputs == ["x", "y"] - - with pytest.raises(ValueError, match="Source outputs mismatch"): - node = Compositor(sources=[MULTI_0_XY, MULTI_2_X], strict_source_outputs=True) - - with pytest.raises(ValueError, match="Source outputs mismatch"): - node = Compositor(sources=[MULTI_0_XY, MULTI_3_Z], strict_source_outputs=True) - - with pytest.raises(ValueError, match="Source outputs mismatch"): - node = Compositor(sources=[MULTI_0_XY, MULTI_4_YX], strict_source_outputs=True) - - # mixed - with pytest.raises(ValueError, match="Cannot composite standard sources with multi-output sources."): - node = Compositor(sources=[MULTI_2_X, ARRAY_LAT]) - - -class TestOrderedCompositor(object): - def test_composite(self): - with podpac.settings: - podpac.settings["MULTITHREADING"] = False - - acoords = podpac.Coordinates([[0, 1], [10, 20, 30]], dims=["lat", "lon"]) - asource = np.ones(acoords.shape) - asource[0, :] = np.nan - a = Array(source=asource, native_coordinates=acoords) - - bcoords = podpac.Coordinates([[0, 1, 2], [10, 20, 30, 40]], dims=["lat", "lon"]) - bsource = np.zeros(bcoords.shape) - bsource[:, 0] = np.nan - b = Array(source=bsource, native_coordinates=bcoords) - - coords = podpac.Coordinates([[0, 1, 2], [10, 20, 30, 40, 50]], dims=["lat", "lon"]) - - node = OrderedCompositor(sources=[a, b], interpolation="bilinear") - expected = np.array( - [[np.nan, 0.0, 0.0, 0.0, np.nan], [1.0, 1.0, 1.0, 0.0, np.nan], [np.nan, 0.0, 0.0, 0.0, np.nan]] - ) - np.testing.assert_allclose(node.eval(coords), 
expected, equal_nan=True) - - node = OrderedCompositor(sources=[b, a], interpolation="bilinear") - expected = np.array( - [[np.nan, 0.0, 0.0, 0.0, np.nan], [1.0, 0.0, 0.0, 0.0, np.nan], [np.nan, 0.0, 0.0, 0.0, np.nan]] - ) - np.testing.assert_allclose(node.eval(coords), expected, equal_nan=True) - - def test_composite_multithreaded(self): - with podpac.settings: - podpac.settings["MULTITHREADING"] = True - podpac.settings["N_THREADS"] = 8 - - acoords = podpac.Coordinates([[0, 1], [10, 20, 30]], dims=["lat", "lon"]) - asource = np.ones(acoords.shape) - asource[0, :] = np.nan - a = Array(source=asource, native_coordinates=acoords) - - bcoords = podpac.Coordinates([[0, 1, 2], [10, 20, 30, 40]], dims=["lat", "lon"]) - bsource = np.zeros(bcoords.shape) - bsource[:, 0] = np.nan - b = Array(source=bsource, native_coordinates=bcoords) - - coords = podpac.Coordinates([[0, 1, 2], [10, 20, 30, 40, 50]], dims=["lat", "lon"]) - - node = OrderedCompositor(sources=[a, b], interpolation="bilinear") - expected = np.array( - [[np.nan, 0.0, 0.0, 0.0, np.nan], [1.0, 1.0, 1.0, 0.0, np.nan], [np.nan, 0.0, 0.0, 0.0, np.nan]] - ) - np.testing.assert_allclose(node.eval(coords), expected, equal_nan=True) - - node = OrderedCompositor(sources=[b, a], interpolation="bilinear") - expected = np.array( - [[np.nan, 0.0, 0.0, 0.0, np.nan], [1.0, 0.0, 0.0, 0.0, np.nan], [np.nan, 0.0, 0.0, 0.0, np.nan]] - ) - np.testing.assert_allclose(node.eval(coords), expected, equal_nan=True) - - def test_composite_short_circuit(self): - with podpac.settings: - podpac.settings["MULTITHREADING"] = False - podpac.settings["DEBUG"] = True - - coords = podpac.Coordinates([[0, 1], [10, 20, 30]], dims=["lat", "lon"]) - a = Array(source=np.ones(coords.shape), native_coordinates=coords) - b = Array(source=np.zeros(coords.shape), native_coordinates=coords) - node = OrderedCompositor(sources=[a, b], interpolation="bilinear") - output = node.eval(coords) - np.testing.assert_array_equal(output, a.source) - assert node.sources[0]._output is not None - assert node.sources[1]._output is None - - def test_composite_short_circuit_multithreaded(self): - with podpac.settings: - podpac.settings["MULTITHREADING"] = True - podpac.settings["N_THREADS"] = 8 - podpac.settings["DEBUG"] = True - - coords = podpac.Coordinates([[0, 1], [10, 20, 30]], dims=["lat", "lon"]) - n_threads_before = podpac.core.managers.multi_threading.thread_manager._n_threads_used - a = Array(source=np.ones(coords.shape), native_coordinates=coords) - b = Array(source=np.zeros(coords.shape), native_coordinates=coords) - node = OrderedCompositor(sources=[a, b], interpolation="bilinear") - output = node.eval(coords) - np.testing.assert_array_equal(output, a.source) - assert node._multi_threaded == True - assert podpac.core.managers.multi_threading.thread_manager._n_threads_used == n_threads_before - - def test_composite_into_result(self): - coords = podpac.Coordinates([[0, 1], [10, 20, 30]], dims=["lat", "lon"]) - a = Array(source=np.ones(coords.shape), native_coordinates=coords) - b = Array(source=np.zeros(coords.shape), native_coordinates=coords) - node = OrderedCompositor(sources=[a, b], interpolation="bilinear") - result = node.create_output_array(coords, data=np.random.random(coords.shape)) - output = node.eval(coords, output=result) - np.testing.assert_array_equal(output, a.source) - np.testing.assert_array_equal(result, a.source) - - def test_composite_multiple_outputs(self): - node = OrderedCompositor(sources=[MULTI_0_XY, MULTI_1_XY]) - output = node.eval(COORDS) - assert output.dims 
== ("lat", "lon", "time", "output") - np.testing.assert_array_equal(output["output"], ["x", "y"]) - np.testing.assert_array_equal(output.sel(output="x"), np.full(COORDS.shape, 0)) - np.testing.assert_array_equal(output.sel(output="y"), np.full(COORDS.shape, 0)) - - node = OrderedCompositor(sources=[MULTI_1_XY, MULTI_0_XY], strict_source_outputs=True) - output = node.eval(COORDS) - assert output.dims == ("lat", "lon", "time", "output") - np.testing.assert_array_equal(output["output"], ["x", "y"]) - np.testing.assert_array_equal(output.sel(output="x"), np.full(COORDS.shape, 1)) - np.testing.assert_array_equal(output.sel(output="y"), np.full(COORDS.shape, 1)) - - def test_composite_combine_multiple_outputs(self): - node = OrderedCompositor(sources=[MULTI_0_XY, MULTI_1_XY, MULTI_2_X, MULTI_3_Z]) - output = node.eval(COORDS) - assert output.dims == ("lat", "lon", "time", "output") - np.testing.assert_array_equal(output["output"], ["x", "y", "z"]) - np.testing.assert_array_equal(output.sel(output="x"), np.full(COORDS.shape, 0)) - np.testing.assert_array_equal(output.sel(output="y"), np.full(COORDS.shape, 0)) - np.testing.assert_array_equal(output.sel(output="z"), np.full(COORDS.shape, 3)) - - node = OrderedCompositor(sources=[MULTI_3_Z, MULTI_2_X, MULTI_0_XY, MULTI_1_XY]) - output = node.eval(COORDS) - assert output.dims == ("lat", "lon", "time", "output") - np.testing.assert_array_equal(output["output"], ["z", "x", "y"]) - np.testing.assert_array_equal(output.sel(output="x"), np.full(COORDS.shape, 2)) - np.testing.assert_array_equal(output.sel(output="y"), np.full(COORDS.shape, 0)) - np.testing.assert_array_equal(output.sel(output="z"), np.full(COORDS.shape, 3)) - - node = OrderedCompositor(sources=[MULTI_2_X, MULTI_4_YX]) - output = node.eval(COORDS) - assert output.dims == ("lat", "lon", "time", "output") - np.testing.assert_array_equal(output["output"], ["x", "y"]) - np.testing.assert_array_equal(output.sel(output="x"), np.full(COORDS.shape, 2)) - np.testing.assert_array_equal(output.sel(output="y"), np.full(COORDS.shape, 4)) - - def test_composite_stacked_unstacked(self): - anative = podpac.Coordinates([podpac.clinspace((0, 1), (1, 2), size=3)], dims=["lat_lon"]) - bnative = podpac.Coordinates([podpac.clinspace(-2, 3, 3), podpac.clinspace(-1, 4, 3)], dims=["lat", "lon"]) - a = Array(source=np.random.rand(3), native_coordinates=anative) - b = Array(source=np.random.rand(3, 3) + 2, native_coordinates=bnative) - - coords = podpac.Coordinates([podpac.clinspace(-3, 4, 32), podpac.clinspace(-2, 5, 32)], dims=["lat", "lon"]) - - node = OrderedCompositor(sources=np.array([a, b]), interpolation="nearest") - o = node.eval(coords) - # Check that both data sources are being used in the interpolation - assert np.any(o.data >= 2) - assert np.any(o.data <= 1) diff --git a/podpac/core/test/test_node.py b/podpac/core/test/test_node.py index 32678f27c..5603393c1 100644 --- a/podpac/core/test/test_node.py +++ b/podpac/core/test/test_node.py @@ -2,8 +2,8 @@ import os import json -import six import warnings +import tempfile from collections import OrderedDict from copy import deepcopy @@ -12,6 +12,7 @@ except: # Python 2.7 import urlparse as urllib +import six import pytest import numpy as np import xarray as xr @@ -23,27 +24,153 @@ import podpac from podpac.core import common_test_utils as ctu -from podpac.core.utils import ArrayTrait +from podpac.core.utils import ArrayTrait, NodeTrait from podpac.core.units import UnitsDataArray from podpac.core.style import Style -from podpac.core.cache import CacheCtrl, 
RamCacheStore -from podpac.core.node import Node, NodeException +from podpac.core.cache import CacheCtrl, RamCacheStore, DiskCacheStore +from podpac.core.node import Node, NodeException, NodeDefinitionError from podpac.core.node import node_eval +from podpac.core.node import NoCacheMixin, DiskCacheMixin class TestNode(object): - def test_eval_not_implemented(self): + def test_style(self): + node = Node() + assert isinstance(node.style, Style) + + def test_units(self): + node = Node(units="meters") + + with pytest.raises(UndefinedUnitError): + Node(units="abc") + + def test_outputs(self): + node = Node() + assert node.outputs is None + + node = Node(outputs=["a", "b"]) + assert node.outputs == ["a", "b"] + + def test_output(self): + node = Node() + assert node.output is None + + node = Node(outputs=["a", "b"]) + assert node.output is None + + node = Node(outputs=["a", "b"], output="b") + assert node.output == "b" + + # must be one of the outputs + with pytest.raises(ValueError, match="Invalid output"): + node = Node(outputs=["a", "b"], output="other") + + # only valid for multiple-output nodes + with pytest.raises(TypeError, match="Invalid output"): + node = Node(output="other") + + def test_cache_output(self): + with podpac.settings: + podpac.settings["CACHE_OUTPUT_DEFAULT"] = False + node = Node() + assert not node.cache_output + + podpac.settings["CACHE_OUTPUT_DEFAULT"] = True + node = Node() + assert node.cache_output + + def test_cache_ctrl(self): + # settings + with podpac.settings: + podpac.settings["DEFAULT_CACHE"] = ["ram"] + node = Node() + assert node.cache_ctrl is not None + assert len(node.cache_ctrl._cache_stores) == 1 + assert isinstance(node.cache_ctrl._cache_stores[0], RamCacheStore) + + podpac.settings["DEFAULT_CACHE"] = ["ram", "disk"] + node = Node() + assert node.cache_ctrl is not None + assert len(node.cache_ctrl._cache_stores) == 2 + assert isinstance(node.cache_ctrl._cache_stores[0], RamCacheStore) + assert isinstance(node.cache_ctrl._cache_stores[1], DiskCacheStore) + + # specify + node = Node(cache_ctrl=["ram"]) + assert node.cache_ctrl is not None + assert len(node.cache_ctrl._cache_stores) == 1 + assert isinstance(node.cache_ctrl._cache_stores[0], RamCacheStore) + + node = Node(cache_ctrl=["ram", "disk"]) + assert node.cache_ctrl is not None + assert len(node.cache_ctrl._cache_stores) == 2 + assert isinstance(node.cache_ctrl._cache_stores[0], RamCacheStore) + assert isinstance(node.cache_ctrl._cache_stores[1], DiskCacheStore) + + def test_tagged_attr_readonly(self): + class MyNode(Node): + my_attr = tl.Any().tag(attr=True) + + with podpac.settings: + podpac.settings["DEBUG"] = False + node = MyNode() + assert node.traits()["my_attr"].read_only + + podpac.settings["DEBUG"] = True + node = MyNode() + assert not node.traits()["my_attr"].read_only + + def test_trait_is_defined(self): + node = Node() + assert node.trait_is_defined("units") + + def test_init(self): + class MyNode(Node): + init_run = False + + def init(self): + super(MyNode, self).init() + self.init_run = True + + node = MyNode() + assert node.init_run + + def test_attrs(self): + class MyNode(Node): + my_attr = tl.Any().tag(attr=True) + my_trait = tl.Any() + + n = MyNode() + assert "my_attr" in n.attrs + assert "my_trait" not in n.attrs + + def test_repr(self): n = Node() - with pytest.raises(NotImplementedError): - n.eval(None) + repr(n) - with pytest.raises(NotImplementedError): - n.eval(None, output=None) + n = Node(outputs=["a", "b"]) + repr(n) + assert "outputs=" in repr(n) + assert "output=" not 
in repr(n) - def test_find_coordinates_not_implemented(self): + n = Node(outputs=["a", "b"], output="a") + repr(n) + assert "outputs=" not in repr(n) + assert "output=" in repr(n) + + def test_str(self): n = Node() - with pytest.raises(NotImplementedError): - n.find_coordinates() + str(n) + + n = Node(outputs=["a", "b"]) + str(n) + assert "outputs=" in str(n) + assert "output=" not in str(n) + + n = Node(outputs=["a", "b"], output="a") + str(n) + assert "outputs=" not in str(n) + assert "output=" in str(n) def test_eval_group(self): class MyNode(Node): @@ -54,8 +181,8 @@ def eval(self, coordinates, output=None): c2 = podpac.Coordinates([[10, 11], [10, 11, 12]], dims=["lat", "lon"]) g = podpac.coordinates.GroupCoordinates([c1, c2]) - n = MyNode() - outputs = n.eval_group(g) + node = MyNode() + outputs = node.eval_group(g) assert isinstance(outputs, list) assert len(outputs) == 2 assert isinstance(outputs[0], UnitsDataArray) @@ -65,69 +192,23 @@ def eval(self, coordinates, output=None): # invalid with pytest.raises(Exception): - n.eval_group(c1) + node.eval_group(c1) with pytest.raises(Exception): - n.eval(g) - - def test_units(self): - n = Node(units="meters") - - with pytest.raises(UndefinedUnitError): - Node(units="abc") - - def test_outputs(self): - n = Node() - assert n.outputs is None - - n = Node(outputs=["a", "b"]) - assert n.outputs == ["a", "b"] - - def test_outputs_and_output(self): - n = Node(outputs=["a", "b"]) - assert n.output is None + node.eval(g) - n = Node(outputs=["a", "b"], output="b") - assert n.output == "b" - - # must be one of the outputs - with pytest.raises(ValueError, match="Invalid output"): - n = Node(outputs=["a", "b"], output="other") - - # only valid for multiple-output nodes - with pytest.raises(TypeError, match="Invalid output"): - n = Node(output="other") - - -def TestNodeEval(self): - def test_extract_output(self): - coords = podpac.Coordinates([[0, 1, 2, 3], [0, 1]], dims=["lat", "lon"]) - - class MyNode1(Node): - @node_eval - def eval(self, coordinates, output=None): - return self.create_output_array(coordinates) - - # don't extract when no output field is requested - n = MyNode1() - out = n.eval(coords) - assert out.shape == (4, 2, 3) - - # do extract when an output field is requested - n = MyNode1(output="b") - out = n.eval(coords) - assert out.shape == (4, 2) + def test_eval_not_implemented(self): + node = Node() + with pytest.raises(NotImplementedError): + node.eval(None) - # should still work if the node has already extracted it - class MyNode2(Node): - @node_eval - def eval(self, coordinates, output=None): - out = self.create_output_array(coordinates) - return out.sel(output=self.output) + with pytest.raises(NotImplementedError): + node.eval(None, output=None) - n = MyNode2(output="b") - out = n.eval(coords) - assert out.shape == (4, 2) + def test_find_coordinates_not_implemented(self): + node = Node() + with pytest.raises(NotImplementedError): + node.find_coordinates() class TestCreateOutputArray(object): @@ -184,6 +265,41 @@ def test_create_output_array_crs(self): assert output.crs == crs +class TestNodeEval(object): + def test_extract_output(self): + coords = podpac.Coordinates([[0, 1, 2, 3], [0, 1]], dims=["lat", "lon"]) + + class MyNode1(Node): + outputs = ["a", "b", "c"] + + @node_eval + def eval(self, coordinates, output=None): + return self.create_output_array(coordinates) + + # don't extract when no output field is requested + node = MyNode1() + out = node.eval(coords) + assert out.shape == (4, 2, 3) + + # do extract when an output field 
is requested + node = MyNode1(output="b") + out = node.eval(coords) + assert out.shape == (4, 2) + + # should still work if the node has already extracted it + class MyNode2(Node): + outputs = ["a", "b", "c"] + + @node_eval + def eval(self, coordinates, output=None): + out = self.create_output_array(coordinates) + return out.sel(output=self.output) + + node = MyNode2(output="b") + out = node.eval(coords) + assert out.shape == (4, 2) + + class TestCaching(object): @classmethod def setup_class(cls): @@ -256,9 +372,10 @@ def test_put_overwrite(self): assert self.node.get_cache("test") == 0 with pytest.raises(NodeException): - self.node.put_cache(1, "test") + self.node.put_cache(1, "test", overwrite=False) + assert self.node.get_cache("test") == 0 - self.node.put_cache(1, "test", overwrite=True) + self.node.put_cache(1, "test") assert self.node.get_cache("test") == 1 def test_rem_all(self): @@ -328,249 +445,183 @@ def test_rem_key_coordinates(self): assert self.node.has_cache("c", coordinates=self.coords2) assert self.node.has_cache("d", coordinates=self.coords) - def test_cache_property_decorator(self): - class Test(podpac.Node): - a = tl.Int(1).tag(attr=True) - b = tl.Int(1).tag(attr=True) - c = tl.Int(1) - d = tl.Int(1) - - @podpac.core.node.cache_func("a2", "a") - def a2(self): - """a2 docstring""" - return self.a * 2 - - @podpac.core.node.cache_func("b2") - def b2(self): - """ b2 docstring """ - return self.b * 2 - - @podpac.core.node.cache_func("c2", "c") - def c2(self): - """ c2 docstring """ - return self.c * 2 - - @podpac.core.node.cache_func("d2") - def d2(self): - """ d2 docstring """ - return self.d * 2 - - t = Test(cache_ctrl=CacheCtrl([RamCacheStore()])) - t2 = Test(cache_ctrl=CacheCtrl([RamCacheStore()])) - t.rem_cache(key="*", coordinates="*") - t2.rem_cache(key="*", coordinates="*") + # node definition errors + # this demonstrates both classes of error in the has_cache case, but only one for put/get/rem + # we could test both classes for put/get/rem as well, but that is not really necessary + def test_has_cache_unavailable_circular(self): + class MyNode(Node): + a = tl.Any().tag(attr=True) - try: - t.get_cache("a2") - raise Exception("Cache should be cleared.") - except podpac.NodeException: - pass + @tl.default("a") + def _default_a(self): + return self.b - assert t.a2() == 2 - assert t.b2() == 2 - assert t.c2() == 2 - assert t.d2() == 2 - assert t2.a2() == 2 - assert t2.b2() == 2 - assert t2.c2() == 2 - assert t2.d2() == 2 - - t.set_trait("a", 2) - assert t.a2() == 4 - t.set_trait("b", 2) - assert t.b2() == 4 # This happens because the node definition changed - t.rem_cache(key="*", coordinates="*") - assert t.c2() == 2 # This forces the cache to update based on the new node definition - assert t.d2() == 2 # This forces the cache to update based on the new node definition - t.c = 2 - assert t.c2() == 4 # This happens because of depends - t.d = 2 - assert t.d2() == 2 # No depends, and doesn't have a tag - - # These should not change - assert t2.a2() == 2 - assert t2.b2() == 2 - assert t2.c2() == 2 - assert t2.d2() == 2 - - t2.set_trait("a", 2) - assert t2.get_cache("a2") == 4 # This was cached by t - t2.set_trait("b", 2) - assert t2.get_cache("c2") == 4 # This was cached by t - assert t2.get_cache("d2") == 2 # This was cached by t - - def test_cache_func_decorator_with_no_cache(self): - class Test(podpac.Node): - a = tl.Int(1).tag(attr=True) - b = tl.Int(1).tag(attr=True) - c = tl.Int(1) - d = tl.Int(1) - - @podpac.core.node.cache_func("a2", "a") - def a2(self): - """a2 
docstring""" - return self.a * 2 - - @podpac.core.node.cache_func("b2") - def b2(self): - """ b2 docstring """ - return self.b * 2 - - @podpac.core.node.cache_func("c2", "c") - def c2(self): - """ c2 docstring """ - return self.c * 2 - - @podpac.core.node.cache_func("d2") - def d2(self): - """ d2 docstring """ - return self.d * 2 - - t = Test(cache_ctrl=None) - t2 = Test(cache_ctrl=None) - t.rem_cache(key="*", coordinates="*") - t2.rem_cache(key="*", coordinates="*") + @property + def b(self): + self.has_cache("b") + return 10 - try: - t.get_cache("a2") - raise Exception("Cache should be cleared.") - except podpac.NodeException: - pass + node = MyNode(cache_ctrl=["ram"]) + with pytest.raises(NodeException, match="Cache unavailable, node definition has a circular dependency"): + assert node.b == 10 + + def test_has_cache_unavailable_uninitialized(self): + class MyNode(Node): + a = tl.Any().tag(attr=True) + + @tl.validate("a") + def _validate_a(self, d): + self.b + return d["value"] + + @property + def b(self): + self.has_cache("key") + return 10 + + with pytest.raises(NodeException, match="Cache unavailable, node is not yet fully initialized"): + node = MyNode(a=3, cache_ctrl=["ram"]) + + def test_put_cache_unavailable_uninitialized(self): + class MyNode(Node): + a = tl.Any().tag(attr=True) + + @tl.validate("a") + def _validate_a(self, d): + self.b + return d["value"] + + @property + def b(self): + self.put_cache(10, "key") + return 10 + + with pytest.raises(NodeException, match="Cache unavailable"): + node = MyNode(a=3, cache_ctrl=["ram"]) + + def test_get_cache_unavailable_uninitialized(self): + class MyNode(Node): + a = tl.Any().tag(attr=True) - assert t.a2() == 2 - assert t.b2() == 2 - assert t.c2() == 2 - assert t.d2() == 2 - assert t2.a2() == 2 - assert t2.b2() == 2 - assert t2.c2() == 2 - assert t2.d2() == 2 - - t.set_trait("a", 2) - assert t.a2() == 4 - t.set_trait("b", 2) - assert t.b2() == 4 # This happens because the node definition changed - t.rem_cache(key="*", coordinates="*") - assert t.c2() == 2 # This forces the cache to update based on the new node definition - assert t.d2() == 2 # This forces the cache to update based on the new node definition - t.c = 2 - assert t.c2() == 4 # This happens because of depends - t.d = 2 - assert t.d2() == 4 # No caching here, so it SHOULD update - - # These should not change - assert t2.a2() == 2 - assert t2.b2() == 2 - assert t2.c2() == 2 - assert t2.d2() == 2 + @tl.validate("a") + def _validate_a(self, d): + self.b + return d["value"] + + @property + def b(self): + self.get_cache("key") + return 10 + + with pytest.raises(NodeException, match="Cache unavailable"): + node = MyNode(a=3, cache_ctrl=["ram"]) + + def test_rem_cache_unavailable_uninitialized(self): + class MyNode(Node): + a = tl.Any().tag(attr=True) + + @tl.validate("a") + def _validate_a(self, d): + self.b + return d["value"] + + @property + def b(self): + self.rem_cache("key") + return 10 + + with pytest.raises(NodeException, match="Cache unavailable"): + node = MyNode(a=3, cache_ctrl=["ram"]) class TestSerialization(object): @classmethod def setup_class(cls): a = podpac.algorithm.Arange() - b = podpac.data.Array(source=[10, 20, 30], native_coordinates=podpac.Coordinates([[0, 1, 2]], dims=["lat"])) - c = podpac.compositor.OrderedCompositor(sources=np.array([a, b])) + b = podpac.data.Array(source=[10, 20, 30], coordinates=podpac.Coordinates([[0, 1, 2]], dims=["lat"])) + c = podpac.compositor.OrderedCompositor(sources=[a, b]) with warnings.catch_warnings(): 
warnings.filterwarnings("ignore", "Insecure evaluation.*") cls.node = podpac.algorithm.Arithmetic(A=a, B=b, C=c, eqn="A + B + C") - cls.node_file_path = "node.json" - if os.path.exists(cls.node_file_path): - os.remove(cls.node_file_path) - - @classmethod - def teardown_class(cls): - if os.path.exists(cls.node_file_path): - os.remove(cls.node_file_path) - def test_base_ref(self): - n = Node() - assert isinstance(n.base_ref, str) + node = Node() + assert isinstance(node.base_ref, six.string_types) def test_base_definition(self): - class N(Node): + node = Node() + d = node._base_definition + assert "node" in d + assert isinstance(d["node"], six.string_types) + + def test_base_definition_attrs(self): + class MyNode(Node): my_attr = tl.Int().tag(attr=True) - my_node_attr = tl.Instance(Node).tag(attr=True) - a = Node() - node = N(my_attr=7, my_node_attr=a) + node = MyNode(my_attr=7) - d = node.base_definition - assert isinstance(d, OrderedDict) - assert "node" in d - assert isinstance(d["node"], str) - assert "attrs" in d - assert isinstance(d["attrs"], OrderedDict) - assert "my_attr" in d["attrs"] + d = node._base_definition assert d["attrs"]["my_attr"] == 7 - assert isinstance(d["lookup_attrs"], OrderedDict) - assert "my_node_attr" in d["lookup_attrs"] - assert d["lookup_attrs"]["my_node_attr"] is a - def test_base_definition_multiple_outputs(self): - n = Node() - d = n.base_definition - if "attrs" in d: - assert "outputs" not in d["attrs"] - assert "output" not in d["attrs"] + def test_base_definition_inputs(self): + class MyNode(Node): + my_attr = NodeTrait().tag(attr=True) - n = Node(outputs=["a", "b"]) - d = n.base_definition - assert "attrs" in d - assert "outputs" in d["attrs"] - assert "output" not in d["attrs"] + a = Node() + node = MyNode(my_attr=a) - n = Node(outputs=["a", "b"], output="b") - d = n.base_definition - assert "attrs" in d - assert "outputs" in d["attrs"] - assert "output" in d["attrs"] + d = node._base_definition + assert d["inputs"]["my_attr"] == a - def test_base_definition_units(self): - n = Node(units="meters") + def test_base_definition_inputs_array(self): + class MyNode(Node): + my_attr = ArrayTrait().tag(attr=True) - d = n.base_definition - assert "attrs" in d - assert isinstance(d["attrs"], OrderedDict) - assert "units" in d["attrs"] - assert d["attrs"]["units"] == "meters" + a = Node() + b = Node() + node = MyNode(my_attr=[a, b]) - n = Node() - d = n.base_definition - assert "units" not in d + d = node._base_definition + assert d["inputs"]["my_attr"][0] == a + assert d["inputs"]["my_attr"][1] == b - def test_base_definition_array_attr(self): - class N(Node): - my_attr = ArrayTrait().tag(attr=True) + def test_base_definition_inputs_dict(self): + class MyNode(Node): + my_attr = tl.Dict().tag(attr=True) - node = N(my_attr=np.ones((2, 3, 4))) - d = node.base_definition - my_attr = np.array(d["attrs"]["my_attr"]) - np.testing.assert_array_equal(my_attr, node.my_attr) + a = Node() + b = Node() + node = MyNode(my_attr={"a": a, "b": b}) - def test_base_definition_coordinates_attr(self): - class N(Node): - my_attr = tl.Instance(podpac.Coordinates).tag(attr=True) + d = node._base_definition + assert d["inputs"]["my_attr"]["a"] == a + assert d["inputs"]["my_attr"]["b"] == b - node = N(my_attr=podpac.Coordinates([[0, 1], [1, 2, 3]], dims=["lat", "lon"])) - d = node.base_definition - assert d["attrs"]["my_attr"] == node.my_attr + def test_base_definition_style(self): + node = Node(style=Style(name="test")) + d = node._base_definition + assert "style" in 
node._base_definition - def test_base_definition_unserializable(self): - class N(Node): - my_attr = tl.Instance(xr.DataArray).tag(attr=True) + def test_base_definition_remove_unnecessary_attrs(self): + node = Node(outputs=["a", "b"], output="a", units="m") + d = node._base_definition + assert "outputs" in d["attrs"] + assert "output" in d["attrs"] + assert "units" in d["attrs"] - node = N(my_attr=xr.DataArray([0, 1])) - with pytest.raises(NodeException, match="Cannot serialize attr 'my_attr'"): - node.base_definition + node = Node() + d = node._base_definition + if "attrs" in d: + assert "outputs" not in d["attrs"] + assert "output" not in d["attrs"] + assert "units" not in d["attrs"] def test_definition(self): # definition d = self.node.definition assert isinstance(d, OrderedDict) - assert len(d) == 4 + assert len(d) == 5 # from_definition with warnings.catch_warnings(): @@ -578,7 +629,7 @@ def test_definition(self): node = Node.from_definition(d) assert node is not self.node - assert node.hash == self.node.hash + assert node == self.node assert isinstance(node, podpac.algorithm.Arithmetic) assert isinstance(node.inputs["A"], podpac.algorithm.Arange) assert isinstance(node.inputs["B"], podpac.data.Array) @@ -588,49 +639,40 @@ def test_definition_duplicate_base_ref(self): n1 = Node(units="m") n2 = Node(units="ft") n3 = Node(units="in") - n = podpac.compositor.OrderedCompositor(sources=[n1, n2, n3]) - d = n.definition + node = podpac.compositor.OrderedCompositor(sources=[n1, n2, n3]) + d = node.definition assert n1.base_ref == n2.base_ref == n3.base_ref - assert len(d) == 4 - - def test_definition_lookup_attrs(self): - global MyNodeWithNodeAttr + assert len(d) == 5 - class MyNodeWithNodeAttr(Node): - my_node_attr = tl.Instance(Node).tag(attr=True) + def test_definition_inputs_array(self): + global MyNodeWithArrayInput - node = MyNodeWithNodeAttr(my_node_attr=podpac.algorithm.Arange()) - d = node.definition - assert isinstance(d, OrderedDict) - assert len(d) == 2 + class MyNodeWithArrayInput(Node): + my_array = ArrayTrait().tag(attr=True) - node2 = Node.from_definition(d) - assert node2 is not node - assert node2.hash == node.hash - assert isinstance(node2, MyNodeWithNodeAttr) - assert isinstance(node2.my_node_attr, podpac.algorithm.Arange) + node1 = MyNodeWithArrayInput(my_array=[podpac.algorithm.Arange()]) + node2 = Node.from_definition(node1.definition) + assert node2 is not node1 and node2 == node1 - def test_definition_lookup_source(self): - global MyNodeWithNodeSource + def test_definition_inputs_dict(self): + global MyNodeWithDictInput - class MyNodeWithNodeSource(podpac.data.DataSource): - source = tl.Instance(Node) + class MyNodeWithDictInput(Node): + my_dict = tl.Dict().tag(attr=True) - node = MyNodeWithNodeSource(source=podpac.algorithm.Arange()) - d = node.definition - assert isinstance(d, OrderedDict) - assert len(d) == 2 + node1 = MyNodeWithDictInput(my_dict={"a": podpac.algorithm.Arange()}) + node2 = Node.from_definition(node1.definition) + assert node2 is not node1 and node2 == node1 - node2 = Node.from_definition(d) - assert node2 is not node - assert node2.hash == node.hash - assert isinstance(node2, MyNodeWithNodeSource) - assert isinstance(node2.source, podpac.algorithm.Arange) + def test_definition_version(self): + d = self.node.definition + assert "podpac_version" in d + assert d["podpac_version"] == podpac.__version__ def test_json(self): # json s = self.node.json - assert isinstance(s, str) + assert isinstance(s, six.string_types) assert json.loads(s) # test from_json 
@@ -638,33 +680,35 @@ def test_json(self): warnings.filterwarnings("ignore", "Insecure evaluation.*") node = Node.from_json(s) assert node is not self.node - assert node.hash == self.node.hash + assert node == self.node assert isinstance(node, podpac.algorithm.Arithmetic) assert isinstance(node.inputs["A"], podpac.algorithm.Arange) assert isinstance(node.inputs["B"], podpac.data.Array) assert isinstance(node.inputs["C"], podpac.compositor.OrderedCompositor) def test_file(self): - # save - self.node.save(self.node_file_path) + path = tempfile.mkdtemp(prefix="podpac-test-") + filename = os.path.join(path, "node.json") - assert os.path.exists(self.node_file_path) + # save + self.node.save(filename) + assert os.path.exists(filename) with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Insecure evaluation.*") - node = Node.load(self.node_file_path) + node = Node.load(filename) assert node is not self.node - assert node.hash == self.node.hash + assert node == self.node assert isinstance(node, podpac.algorithm.Arithmetic) assert isinstance(node.inputs["A"], podpac.algorithm.Arange) assert isinstance(node.inputs["B"], podpac.data.Array) assert isinstance(node.inputs["C"], podpac.compositor.OrderedCompositor) def test_json_pretty(self): - n = Node() - s = n.json_pretty - assert isinstance(s, str) + node = Node() + s = node.json_pretty + assert isinstance(s, six.string_types) json.loads(s) def test_hash(self): @@ -683,6 +727,88 @@ class M(Node): assert n1.hash != n3.hash assert n1.hash != m1.hash + def test_hash_preserves_definition(self): + n = Node() + d_before = deepcopy(n.definition) + h = n.hash + d_after = deepcopy(n.definition) + + assert d_before == d_after + + def test_hash_omit_style(self): + class N(Node): + my_attr = tl.Int().tag(attr=True) + + n1 = N(my_attr=1, style=Style(name="a")) + n2 = N(my_attr=1, style=Style(name="b")) + + # json has style in it + assert n1.json != n2.json + + # but hash does not + assert n1.hash == n2.hash + + def test_hash_omit_version(self): + version = podpac.__version__ + + try: + # actual version + n1 = Node() + s1 = n1.json + h1 = n1.hash + + # spoof different version + podpac.__version__ = "other" + n2 = Node() + s2 = n2.json + h2 = n2.hash + + # JSON should be different, but hash should be the same + assert s1 != s2 + assert h1 == h2 + + finally: + # reset version + podpac.__version__ = version + + def test_eq(self): + class N(Node): + my_attr = tl.Int().tag(attr=True) + + class M(Node): + my_attr = tl.Int().tag(attr=True) + + n1 = N(my_attr=1) + n2 = N(my_attr=1) + n3 = N(my_attr=2) + m1 = M(my_attr=1) + + # eq + assert n1 == n2 + assert not n1 == n3 + assert not n1 == m1 + assert not n1 == "other" + + # ne + assert not n1 != n2 + assert n1 != n3 + assert n1 != m1 + assert n1 != "other" + + def test_eq_ignore_style(self): + class N(Node): + my_attr = tl.Int().tag(attr=True) + + n1 = N(my_attr=1, style=Style(name="a")) + n2 = N(my_attr=1, style=Style(name="b")) + + # json has style in it + assert n1.json != n2.json + + # but == and != don't care + assert n1 == n2 + assert not n1 != n2 + def test_from_url(self): url = ( r"http://testwms/?map=map&&service={service}&request=GetMap&{layername}={layer}&styles=&format=image%2Fpng" @@ -704,16 +830,10 @@ def test_from_url(self): ): pipe = Node.from_url(url.format(service=service, layername=layername, layer=layer, params=param)) - def test_pipeline(self): - n = Node() - with pytest.warns(DeprecationWarning): - p = n.pipeline - assert isinstance(p, podpac.pipeline.Pipeline) - def test_style(self): 
node = podpac.data.Array( source=[10, 20, 30], - native_coordinates=podpac.Coordinates([[0, 1, 2]], dims=["lat"]), + coordinates=podpac.Coordinates([[0, 1, 2]], dims=["lat"]), style=Style(name="test", units="m"), ) @@ -729,10 +849,24 @@ def test_style(self): assert node2.style.units == "m" # default style - node = podpac.data.Array(source=[10, 20, 30], native_coordinates=podpac.Coordinates([[0, 1, 2]], dims=["lat"])) + node = podpac.data.Array(source=[10, 20, 30], coordinates=podpac.Coordinates([[0, 1, 2]], dims=["lat"])) d = node.definition assert "style" not in d[node.base_ref] + def test_circular_definition(self): + # this is admittedly a contrived example in order to demonstrate the most direct case + class MyNode(Node): + a = tl.Any().tag(attr=True) + + @tl.default("a") + def _default_a(self): + self.definition + return 10 + + node = MyNode() + with pytest.raises(NodeDefinitionError, match="node definition has a circular dependency"): + node.a + class TestUserDefinition(object): def test_empty(self): @@ -756,476 +890,69 @@ def test_invalid_node(self): with pytest.raises(ValueError, match="class 'Nonexistent' not found in module"): Node.from_json(s) - def test_datasource_source(self): - # basic - s = """ - { - "mydata": { - "node": "data.DataSource", - "source": "my_data_string" - } - } - """ - - node = Node.from_json(s) - assert isinstance(node, podpac.data.DataSource) - assert node.source == "my_data_string" - - # not required - s = """ - { - "mydata": { - "node": "data.DataSource" - } - } - """ - - node = Node.from_json(s) - assert isinstance(node, podpac.data.DataSource) - - # incorrect - s = """ - { - "mydata": { - "node": "data.DataSource", - "attrs": { - "source": "my_data_string" - } - } - } - """ - - with pytest.raises(ValueError, match="DataSource 'attrs' cannot have a 'source' property"): - node = Node.from_json(s) - - def test_datasource_lookup_source(self): - # sub-node - s = """ - { - "a": { - "node": "data.DataSource", - "source": "my_data_string" - }, - "b": { - "node": "data.DataSource", - "lookup_source": "a.source" - } - } - """ - - node = Node.from_json(s) - assert isinstance(node, podpac.data.DataSource) - assert node.source == "my_data_string" - - # nonexistent node - s = """ - { - "a": { - "node": "data.DataSource", - "source": "my_data_string" - }, - "b": { - "node": "data.DataSource", - "lookup_source": "nonexistent.source" - } - } - """ - - with pytest.raises(ValueError, match="reference to nonexistent node/attribute"): - Node.from_json(s) - - # nonexistent subattr - s = """ - { - "a": { - "node": "data.DataSource", - "source": "my_data_string" - }, - "b": { - "node": "data.DataSource", - "lookup_source": "a.nonexistent" - } - } - """ - - with pytest.raises(ValueError, match="reference to nonexistent node/attribute"): - Node.from_json(s) - - # nonexistent subsubattr + def test_inputs(self): + # invalid type s = """ { "a": { - "node": "data.DataSource", - "source": "my_data_string" - }, - "b": { - "node": "data.DataSource", - "lookup_source": "a.source.nonexistent" - } - } - """ - - with pytest.raises(ValueError, match="reference to nonexistent node/attribute"): - Node.from_json(s) - - # in attrs (incorrect) - s = """ - { - "mydata": { - "node": "data.DataSource", - "attrs": { - "lookup_source": "my_data_string" - } + "node": "algorithm.Min", + "inputs": { "source": 10 } } } """ - with pytest.raises(ValueError, match="DataSource 'attrs' cannot have a 'lookup_source' property"): + with pytest.raises(ValueError, match="Invalid definition for node"): 
Node.from_json(s) - def test_reprojected_source_lookup_source(self): - # NOTE: nonexistent node/attribute references are tested in test_datasource_lookup_source - - # lookup_source - s = """ - { - "mysource": { - "node": "data.DataSource", - "source": "my_data_string" - }, - "reprojected": { - "node": "data.ReprojectedSource", - "lookup_source": "mysource" - } - } - """ - - node = Node.from_json(s) - assert isinstance(node, podpac.data.ReprojectedSource) - assert isinstance(node.source, podpac.data.DataSource) - assert node.source.source == "my_data_string" - - # lookup_source subattr - s = """ - { - "mysource": { - "node": "data.DataSource", - "source": "my_data_string" - }, - "mean": { - "node": "algorithm.Mean", - "inputs": {"source": "mysource"} - }, - "reprojected": { - "node": "data.ReprojectedSource", - "lookup_source": "mean.source" - } - } - """ - - node = Node.from_json(s) - assert isinstance(node, podpac.data.ReprojectedSource) - assert isinstance(node.source, podpac.data.DataSource) - assert node.source.source == "my_data_string" - - # 'source' should fail - s = """ - { - "mysource": { - "node": "data.DataSource", - "source": "my_data_string" - }, - "reprojected": { - "node": "data.ReprojectedSource", - "source": "mysource" - } - } - """ - - with pytest.raises(tl.TraitError): - Node.from_json(s) - - def test_array_source(self): - s = """ - { - "mysource": { - "node": "data.Array", - "source": [0, 1, 2] - } - } - """ - - node = Node.from_json(s) - assert isinstance(node, podpac.data.Array) - np.testing.assert_array_equal(node.source, [0, 1, 2]) - - def test_array_lookup_source(self): - s = """ - { - "a": { - "node": "data.Array", - "source": [0, 1, 2] - }, - "b": { - "node": "data.Array", - "lookup_source": "a.source" - } - } - """ - - node = Node.from_json(s) - assert isinstance(node, podpac.data.Array) - np.testing.assert_array_equal(node.source, [0, 1, 2]) - - # 'source' should fail + # nonexistent node s = """ { "a": { - "node": "data.Array", - "source": [0, 1, 2] - }, - "b": { - "node": "data.Array", - "source": "a.source" + "node": "algorithm.Min", + "inputs": { "source": "nonexistent" } } } """ - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="Invalid definition for node"): Node.from_json(s) - def test_algorithm_inputs(self): - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", "Insecure evaluation.*") - # NOTE: nonexistent node/attribute references are tested in test_datasource_lookup_source - - # basic - s = """ - { - "source1": {"node": "algorithm.Arange"}, - "source2": {"node": "algorithm.CoordData"}, - "result": { - "node": "algorithm.Arithmetic", - "inputs": { - "A": "source1", - "B": "source2" - }, - "attrs": { - "eqn": "A + B" - } - } - } - """ - - node = Node.from_json(s) - assert isinstance(node, podpac.algorithm.Arithmetic) - assert isinstance(node.inputs["A"], podpac.algorithm.Arange) - assert isinstance(node.inputs["B"], podpac.algorithm.CoordData) - - # sub-node - s = """ - { - "mysource": {"node": "algorithm.Arange"}, - "mean": { - "node": "algorithm.Mean", - "inputs": { "source": "mysource" } - }, - "double": { - "node": "algorithm.Arithmetic", - "inputs": { "A": "mean.source" }, - "attrs": { "eqn": "2 * A" } - } - } - """ - - node = Node.from_json(s) - assert isinstance(node, podpac.algorithm.Arithmetic) - assert isinstance(node.inputs["A"], podpac.algorithm.Arange) - - # in attrs (incorrect) - s = """ - { - "source1": {"node": "algorithm.Arange"}, - "source2": {"node": "algorithm.CoordData"}, - "result": { - 
"node": "algorithm.Arithmetic", - "attrs": { - "inputs": { - "A": "source1", - "B": "source2" - }, - "eqn": "A + B" - } - } - } - """ - - with pytest.raises(ValueError, match="Algorithm 'attrs' cannot have an 'inputs' property"): - Node.from_json(s) - - def test_compositor_sources(self): - # NOTE: nonexistent node/attribute references are tested in test_datasource_lookup_source - - # basic - s = """ - { - "a": {"node": "algorithm.Arange"}, - "b": {"node": "algorithm.CoordData"}, - "c": { - "node": "compositor.OrderedCompositor", - "sources": ["a", "b"] - } - } - """ - - node = Node.from_json(s) - assert isinstance(node, podpac.compositor.OrderedCompositor) - assert isinstance(node.sources[0], podpac.algorithm.Arange) - assert isinstance(node.sources[1], podpac.algorithm.CoordData) - - # sub-node - s = """ - { - "source1": {"node": "algorithm.Arange"}, - "mean1": { - "node": "algorithm.Mean", - "inputs": {"source": "source1"} - }, - "c": { - "node": "compositor.OrderedCompositor", - "sources": ["mean1.source", "source1"] - } - } - """ - - node = Node.from_json(s) - assert isinstance(node, podpac.compositor.OrderedCompositor) - assert isinstance(node.sources[0], podpac.algorithm.Arange) - assert isinstance(node.sources[1], podpac.algorithm.Arange) - - def test_datasource_interpolation(self): - s = """ - { - "mydata": { - "node": "data.DataSource", - "source": "my_data_string", - "interpolation": "nearest" - } - } - """ - - node = Node.from_json(s) - assert isinstance(node, podpac.data.DataSource) - assert node.interpolation == "nearest" - - # not required - s = """ - { - "mydata": { - "node": "data.DataSource" - } - } - """ - - node = Node.from_json(s) - assert isinstance(node, podpac.data.DataSource) - - # incorrect - s = """ - { - "mydata": { - "node": "data.DataSource", - "attrs": { - "interpolation": "nearest" - } - } - } - """ - - with pytest.raises(ValueError, match="DataSource 'attrs' cannot have an 'interpolation' property"): - Node.from_json(s) - - def test_compositor_interpolation(self): - s = """ - { - "a": { - "node": "algorithm.Arange" - }, - "b": { - "node": "algorithm.Arange" - }, - "c": { - "node": "compositor.OrderedCompositor", - "sources": ["a", "b"], - "interpolation": "nearest" - } - } - """ - - node = Node.from_json(s) - assert isinstance(node, podpac.compositor.OrderedCompositor) - assert node.interpolation == "nearest" - - # not required + def test_lookup_attrs(self): s = """ { "a": { - "node": "algorithm.Arange" + "node": "algorithm.CoordData", + "attrs": { "coord_name": "lat" } }, "b": { - "node": "algorithm.Arange" - }, - "c": { - "node": "compositor.OrderedCompositor", - "sources": ["a", "b"] + "node": "algorithm.CoordData", + "lookup_attrs": { "coord_name": "a.coord_name" } } } """ node = Node.from_json(s) - assert isinstance(node, podpac.compositor.OrderedCompositor) + assert isinstance(node, podpac.algorithm.CoordData) + assert node.coord_name == "lat" - # incorrect + # invalid type s = """ { "a": { - "node": "algorithm.Arange" + "node": "algorithm.CoordData", + "attrs": { "coord_name": "lat" } }, "b": { - "node": "algorithm.Arange" - }, - "c": { - "node": "compositor.OrderedCompositor", - "sources": ["a", "b"], - "attrs": { - "interpolation": "nearest" - } + "node": "algorithm.CoordData", + "lookup_attrs": { "coord_name": 10 } } } """ - with pytest.raises(ValueError, match="Compositor 'attrs' cannot have an 'interpolation' property"): + with pytest.raises(ValueError, match="Invalid definition for node"): Node.from_json(s) - def test_attrs(self): - s = """ - { 
- "sm": { - "node": "datalib.smap.SMAP", - "attrs": { - "product": "SPL4SMGP" - } - } - } - """ - - node = Node.from_json(s) - assert isinstance(node, podpac.datalib.smap.SMAP) - assert node.product == "SPL4SMGP" - - def test_lookup_attrs(self): - # NOTE: nonexistent node/attribute references are tested in test_datasource_lookup_source - + # nonexistent node s = """ { "a": { @@ -1234,41 +961,15 @@ def test_lookup_attrs(self): }, "b": { "node": "algorithm.CoordData", - "lookup_attrs": { "coord_name": "a.coord_name" } + "lookup_attrs": { "coord_name": "nonexistent.coord_name" } } } """ - node = Node.from_json(s) - assert isinstance(node, podpac.algorithm.CoordData) - assert node.coord_name == "lat" - - # lookup node directly (instead of a sub-attr) - global MyNodeWithNodeAttr - - class MyNodeWithNodeAttr(Node): - my_node_attr = tl.Instance(Node).tag(attr=True) - - s = """ - { - "mysource": { - "node": "data.DataSource" - }, - "mynode": { - "plugin": "test_node", - "node": "MyNodeWithNodeAttr", - "lookup_attrs": { - "my_node_attr": "mysource" - } - } - } - """ - - node = Node.from_json(s) - assert isinstance(node, MyNodeWithNodeAttr) - assert isinstance(node.my_node_attr, podpac.data.DataSource) + with pytest.raises(ValueError, match="Invalid definition for node"): + Node.from_json(s) - # attrs should not work + # nonexistent subattr s = """ { "a": { @@ -1277,13 +978,13 @@ class MyNodeWithNodeAttr(Node): }, "b": { "node": "algorithm.CoordData", - "attrs": { "coord_name": "a.coord_name" } + "lookup_attrs": { "coord_name": "a.nonexistent" } } } """ - node = Node.from_json(s) - assert node.coord_name == "a.coord_name" # this will fail at evaluation + with pytest.raises(ValueError, match="Invalid definition for node"): + Node.from_json(s) def test_invalid_property(self): s = """ @@ -1337,18 +1038,18 @@ def test_debuggable(self): }, "mean": { "node": "algorithm.SpatialConvolution", - "inputs": {"source": "a"}, + "lookup_attrs": {"source": "a"}, "attrs": {"kernel_type": "mean,3"} }, "c": { "node": "algorithm.Arithmetic", - "inputs": {"A": "a", "B": "mean"}, + "lookup_attrs": {"A": "a", "B": "mean"}, "attrs": {"eqn": "a-b"} } } """ - with warnings.catch_warnings(): + with warnings.catch_warnings(), podpac.settings: warnings.filterwarnings("ignore", "Insecure evaluation.*") # normally node objects can and should be re-used @@ -1361,6 +1062,56 @@ def test_debuggable(self): node = Node.from_json(s) assert node.inputs["A"] is not node.inputs["B"].source + def test_from_definition_version_warning(self): + s = """ + { + "a": { + "node": "algorithm.Arange" + }, + "podpac_version": "other" + } + """ + + with pytest.warns(UserWarning, match="node definition version mismatch"): + node = Node.from_json(s) + + +class TestNoCacheMixin(object): + class NoCacheNode(NoCacheMixin, Node): + pass + + def test_default_no_cache(self): + with podpac.settings: + podpac.settings["DEFAULT_CACHE"] = ["ram"] + node = self.NoCacheNode() + assert len(node.cache_ctrl._cache_stores) == 0 + + def test_customizable(self): + podpac.settings["DEFAULT_CACHE"] = ["ram"] + node = self.NoCacheNode(cache_ctrl=["ram"]) + assert len(node.cache_ctrl._cache_stores) == 1 + + +class TestDiskCacheMixin(object): + class DiskCacheNode(DiskCacheMixin, Node): + pass + + def test_default_disk_cache(self): + with podpac.settings: + # add disk cache + podpac.settings["DEFAULT_CACHE"] = ["ram"] + node = self.DiskCacheNode() + assert len(node.cache_ctrl._cache_stores) == 2 + + # don't add if it is already there + podpac.settings["DEFAULT_CACHE"] = 
["ram", "disk"] + node = self.DiskCacheNode() + assert len(node.cache_ctrl._cache_stores) == 2 + + def test_customizable(self): + node = self.DiskCacheNode(cache_ctrl=["ram"]) + assert len(node.cache_ctrl._cache_stores) == 1 + # TODO: remove this - this is currently a placeholder test until we actually have integration tests (pytest will exit with code 5 if no tests found) @pytest.mark.integration diff --git a/podpac/core/test/test_settings.py b/podpac/core/test/test_settings.py index faf445c29..cf5e818fa 100644 --- a/podpac/core/test/test_settings.py +++ b/podpac/core/test/test_settings.py @@ -27,8 +27,8 @@ def teardown_method(self): def test_settings_file_defaults_to_home_dir(self): self.make_settings_tmp_dir() # so teardown method has something ot tear down settings = PodpacSettings() - path = os.path.expanduser("~") - assert settings.settings_path == os.path.join(path, ".podpac", "settings.json") + path = os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~")) + assert settings.settings_path == os.path.join(path, ".config", "podpac", "settings.json") def test_single_saved_setting_persists(self): path = self.make_settings_tmp_dir() @@ -73,5 +73,5 @@ def test_misconfigured_settings_file_fall_back_on_default(self): settings.load(path=path) assert isinstance(settings, dict) - path = os.path.expanduser("~") - assert settings.settings_path == os.path.join(path, ".podpac", "settings.json") + path = os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~")) + assert settings.settings_path == os.path.join(path, ".config", "podpac", "settings.json") diff --git a/podpac/core/test/test_units.py b/podpac/core/test/test_units.py index a73f5e72e..81b2f57ca 100644 --- a/podpac/core/test/test_units.py +++ b/podpac/core/test/test_units.py @@ -1,6 +1,7 @@ from __future__ import division, unicode_literals, print_function, absolute_import import io +import tempfile import pytest import numpy as np @@ -13,7 +14,6 @@ from podpac.core.units import ureg from podpac.core.units import UnitsDataArray from podpac.core.units import to_image -from podpac.core.units import create_dataarray # DEPRECATED from podpac.data import Array, Rasterio @@ -445,10 +445,6 @@ def test_invalid_coords(self): with pytest.raises(TypeError): UnitsDataArray.create((3, 4)) - def test_deprecate_create_dataarray(self): - with pytest.deprecated_call(): - create_dataarray(self.coords, data=10) - class TestOpenDataArray(object): def test_open_after_create(self): @@ -482,8 +478,8 @@ def test_open_after_eval(self): lat = np.linspace(-10, 10, 5) lon = np.linspace(-10, 10, 5) native_coords = Coordinates([lat, lon], ["lat", "lon"]) - node = Array(source=data, native_coordinates=native_coords) - uda = node.eval(node.native_coordinates) + node = Array(source=data, coordinates=native_coords) + uda = node.eval(node.coordinates) ncdf = uda.to_netcdf() uda_2 = UnitsDataArray.open(ncdf) @@ -517,7 +513,7 @@ def make_square_array(self, order=1, bands=1): # bands = 3 node = Array( source=np.arange(8 * bands).reshape(3 - order, 3 + order, bands), - native_coordinates=Coordinates([clinspace(4, 0, 2, "lat"), clinspace(1, 4, 4, "lon")][::order]), + coordinates=Coordinates([clinspace(4, 0, 2, "lat"), clinspace(1, 4, 4, "lon")][::order]), outputs=[str(s) for s in list(range(bands))], ) return node @@ -531,7 +527,7 @@ def make_rot_array(self, order=1, bands=1): c = Coordinates([rc]) node = Array( source=np.arange(8 * bands).reshape(3 - order, 3 + order, bands), - native_coordinates=c, + coordinates=c, outputs=[str(s) for s in list(range(bands))], ) return node @@ 
-539,105 +535,105 @@ def make_rot_array(self, order=1, bands=1): def test_to_geotiff_rountrip_1band(self): # lat/lon order, usual node = self.make_square_array() - out = node.eval(node.native_coordinates) - fp = io.BytesIO() - out.to_geotiff(fp) - fp.write(b"a") # for some reason needed to get good comparison - fp.seek(0) - rnode = Rasterio(source=fp, outputs=node.outputs, mode="r") + out = node.eval(node.coordinates) + with tempfile.NamedTemporaryFile("wb") as fp: + out.to_geotiff(fp) + fp.write(b"a") # for some reason needed to get good comparison - assert node.native_coordinates == rnode.native_coordinates + fp.seek(0) + rnode = Rasterio(source=fp.name, outputs=node.outputs) + assert rnode.coordinates == node.coordinates - rout = rnode.eval(rnode.native_coordinates) - np.testing.assert_almost_equal(out.data, rout.data) + rout = rnode.eval(rnode.coordinates) + np.testing.assert_almost_equal(rout.data, out.data) - # lon/lat order, unsual + # lon/lat order, unusual node = self.make_square_array(order=-1) - out = node.eval(node.native_coordinates) - fp = io.BytesIO() - out.to_geotiff(fp) - fp.write(b"a") # for some reason needed to get good comparison - fp.seek(0) - rnode = Rasterio(source=fp, outputs=node.outputs) + out = node.eval(node.coordinates) + with tempfile.NamedTemporaryFile("wb") as fp: + out.to_geotiff(fp) + fp.write(b"a") # for some reason needed to get good comparison - assert node.native_coordinates == rnode.native_coordinates + fp.seek(0) + rnode = Rasterio(source=fp.name, outputs=node.outputs) + assert rnode.coordinates == node.coordinates - rout = rnode.eval(rnode.native_coordinates) - np.testing.assert_almost_equal(out.data, rout.data) + rout = rnode.eval(rnode.coordinates) + np.testing.assert_almost_equal(rout.data, out.data) def test_to_geotiff_rountrip_2band(self): # lat/lon order, usual node = self.make_square_array(bands=2) - out = node.eval(node.native_coordinates) - fp = io.BytesIO() - out.to_geotiff(fp) - fp.write(b"a") # for some reason needed to get good comparison - fp.seek(0) - rnode = Rasterio(source=fp, outputs=node.outputs, mode="r") + out = node.eval(node.coordinates) + with tempfile.NamedTemporaryFile("wb") as fp: + out.to_geotiff(fp) + fp.write(b"a") # for some reason needed to get good comparison - assert node.native_coordinates == rnode.native_coordinates + fp.seek(0) + rnode = Rasterio(source=fp.name, outputs=node.outputs) + assert rnode.coordinates == node.coordinates - rout = rnode.eval(rnode.native_coordinates) - np.testing.assert_almost_equal(out.data, rout.data) + rout = rnode.eval(rnode.coordinates) + np.testing.assert_almost_equal(rout.data, out.data) # lon/lat order, unsual node = self.make_square_array(order=-1, bands=2) - out = node.eval(node.native_coordinates) - fp = io.BytesIO() - out.to_geotiff(fp) - fp.write(b"a") # for some reason needed to get good comparison - fp.seek(0) - rnode = Rasterio(source=fp, outputs=node.outputs) - - assert node.native_coordinates == rnode.native_coordinates - - rout = rnode.eval(rnode.native_coordinates) - np.testing.assert_almost_equal(out.data, rout.data) - - # Check single output - fp.seek(0) - rnode = Rasterio(source=fp, outputs=node.outputs, output=node.outputs[1]) - rout = rnode.eval(rnode.native_coordinates) - np.testing.assert_almost_equal(out.data[..., 1], rout.data) - - # Check single band 1 - fp.seek(0) - rnode = Rasterio(source=fp, band=1) - rout = rnode.eval(rnode.native_coordinates) - np.testing.assert_almost_equal(out.data[..., 0], rout.data) - - # Check single band 2 - fp.seek(0) - rnode = 
Rasterio(source=fp, band=2) - rout = rnode.eval(rnode.native_coordinates) - np.testing.assert_almost_equal(out.data[..., 1], rout.data) + out = node.eval(node.coordinates) + with tempfile.NamedTemporaryFile("wb") as fp: + out.to_geotiff(fp) + fp.write(b"a") # for some reason needed to get good comparison + + fp.seek(0) + rnode = Rasterio(source=fp.name, outputs=node.outputs) + assert rnode.coordinates == node.coordinates + + rout = rnode.eval(rnode.coordinates) + np.testing.assert_almost_equal(rout.data, out.data) + + # Check single output + fp.seek(0) + rnode = Rasterio(source=fp.name, outputs=node.outputs, output=node.outputs[1]) + rout = rnode.eval(rnode.coordinates) + np.testing.assert_almost_equal(out.data[..., 1], rout.data) + + # Check single band 1 + fp.seek(0) + rnode = Rasterio(source=fp.name, band=1) + rout = rnode.eval(rnode.coordinates) + np.testing.assert_almost_equal(out.data[..., 0], rout.data) + + # Check single band 2 + fp.seek(0) + rnode = Rasterio(source=fp.name, band=2) + rout = rnode.eval(rnode.coordinates) + np.testing.assert_almost_equal(out.data[..., 1], rout.data) @pytest.mark.skip("TODO: We can remove this skipped test after solving #363") def test_to_geotiff_rountrip_rotcoords(self): # lat/lon order, usual node = self.make_rot_array() - out = node.eval(node.native_coordinates) - fp = io.BytesIO() - out.to_geotiff(fp) - fp.write(b"a") # for some reason needed to get good comparison - fp.seek(0) - rnode = Rasterio(source=fp, outputs=node.outputs, mode="r") + out = node.eval(node.coordinates) + with tempfile.NamedTemporaryFile("wb") as fp: + out.to_geotiff(fp) + fp.write(b"a") # for some reason needed to get good comparison - assert node.native_coordinates == rnode.native_coordinates + fp.seek(0) + rnode = Rasterio(source=fp.name, outputs=node.outputs, mode="r") + assert node.coordinates == rnode.coordinates - rout = rnode.eval(rnode.native_coordinates) - np.testing.assert_almost_equal(out.data, rout.data) + rout = rnode.eval(rnode.coordinates) + np.testing.assert_almost_equal(out.data, rout.data) # lon/lat order, unsual node = self.make_square_array(order=-1) - out = node.eval(node.native_coordinates) - fp = io.BytesIO() - out.to_geotiff(fp) - fp.write(b"a") # for some reason needed to get good comparison - fp.seek(0) - rnode = Rasterio(source=fp, outputs=node.outputs) + out = node.eval(node.coordinates) + with tempfile.NamedTemporaryFile("wb") as fp: + out.to_geotiff(fp) + fp.write(b"a") # for some reason needed to get good comparison - assert node.native_coordinates == rnode.native_coordinates + fp.seek(0) + rnode = Rasterio(source=fp.name, outputs=node.outputs) + assert node.coordinates == rnode.coordinates - rout = rnode.eval(rnode.native_coordinates) - np.testing.assert_almost_equal(out.data, rout.data) + rout = rnode.eval(rnode.coordinates) + np.testing.assert_almost_equal(out.data, rout.data) diff --git a/podpac/core/test/test_utils.py b/podpac/core/test/test_utils.py index 54a181875..686dfda00 100644 --- a/podpac/core/test/test_utils.py +++ b/podpac/core/test/test_utils.py @@ -1,33 +1,71 @@ from __future__ import division, unicode_literals, print_function, absolute_import import os +import sys +import json +import datetime +import warnings from collections import OrderedDict import pytest import numpy as np +import pandas as pd +import xarray as xr import traitlets as tl -import sys -import podpac.core.utils as ut + +import podpac +from podpac.core.utils import common_doc +from podpac.core.utils import trait_is_defined +from podpac.core.utils import 
create_logfile +from podpac.core.utils import OrderedDictTrait, ArrayTrait, TupleTrait, NodeTrait +from podpac.core.utils import JSONEncoder, is_json_serializable +from podpac.core.utils import cached_property +from podpac.core.utils import ind2slice class TestCommonDocs(object): def test_common_docs_does_not_affect_anonymous_functions(self): f = lambda x: x - f2 = ut.common_doc({"key": "value"})(f) + f2 = common_doc({"key": "value"})(f) assert f(42) == f2(42) assert f.__doc__ is None -# TODO: add log testing -class TestLog(object): - def test_create_log(self): - pass +class TestTraitletsHelpers(object): + def test_trait_is_defined(self): + class MyClass(tl.HasTraits): + a = tl.Any() + b = tl.Any(default_value=0) + c = tl.Any() + + @tl.default("c") + def _default_b(self): + return "test" + + x = MyClass(a=1, b=1, c=1) + assert trait_is_defined(x, "a") + assert trait_is_defined(x, "b") + assert trait_is_defined(x, "c") + assert not trait_is_defined(x, "other") + + x = MyClass() + assert trait_is_defined(x, "a") + assert trait_is_defined(x, "b") + assert not trait_is_defined(x, "c") + + x.c + assert trait_is_defined(x, "c") + + +class TestLoggingHelpers(object): + def test_create_logfile(self): + create_logfile() class TestOrderedDictTrait(object): def test(self): class MyClass(tl.HasTraits): - d = ut.OrderedDictTrait() + d = OrderedDictTrait() m = MyClass(d=OrderedDict([("a", 1)])) @@ -37,14 +75,14 @@ class MyClass(tl.HasTraits): @pytest.mark.skipif(sys.version < "3.6", reason="python < 3.6") def test_dict_python36(self): class MyClass(tl.HasTraits): - d = ut.OrderedDictTrait() + d = OrderedDictTrait() m = MyClass(d={"a": 1}) @pytest.mark.skipif(sys.version >= "3.6", reason="python >= 3.6") def test_dict_python2(self): class MyClass(tl.HasTraits): - d = ut.OrderedDictTrait() + d = OrderedDictTrait() with pytest.raises(tl.TraitError): m = MyClass(d={"a": 1}) @@ -56,7 +94,7 @@ class MyClass(tl.HasTraits): class TestArrayTrait(object): def test(self): class MyClass(tl.HasTraits): - a = ut.ArrayTrait() + a = ArrayTrait() # basic usage o = MyClass(a=np.array([0, 4])) @@ -68,14 +106,9 @@ class MyClass(tl.HasTraits): assert isinstance(o.a, np.ndarray) np.testing.assert_equal(o.a, [0, 4]) - # invalid - # As of numpy 0.16, no longer raises an error - # with pytest.raises(tl.TraitError): - # MyClass(a=[0, [4, 5]]) - def test_ndim(self): class MyClass(tl.HasTraits): - a = ut.ArrayTrait(ndim=2) + a = ArrayTrait(ndim=2) MyClass(a=np.array([[0, 4]])) MyClass(a=[[0, 4]]) @@ -86,7 +119,7 @@ class MyClass(tl.HasTraits): def test_shape(self): class MyClass(tl.HasTraits): - a = ut.ArrayTrait(shape=(2, 2)) + a = ArrayTrait(shape=(2, 2)) MyClass(a=np.array([[0, 1], [2, 3]])) MyClass(a=[[0, 1], [2, 3]]) @@ -97,7 +130,7 @@ class MyClass(tl.HasTraits): def test_dtype(self): class MyClass(tl.HasTraits): - a = ut.ArrayTrait(dtype=float) + a = ArrayTrait(dtype=float) m = MyClass(a=np.array([0.0, 1.0])) assert m.a.dtype == float @@ -115,15 +148,275 @@ class MyClass(tl.HasTraits): def test_args(self): # shape and ndim must match - t = ut.ArrayTrait(ndim=2, shape=(2, 2)) + t = ArrayTrait(ndim=2, shape=(2, 2)) with pytest.raises(ValueError): - ut.ArrayTrait(ndim=1, shape=(2, 2)) + ArrayTrait(ndim=1, shape=(2, 2)) # dtype lookup - t = ut.ArrayTrait(dtype="datetime64") + t = ArrayTrait(dtype="datetime64") assert t.dtype == np.datetime64 # invalid dtype with pytest.raises(ValueError): - ut.ArrayTrait(dtype="notatype") + ArrayTrait(dtype="notatype") + + +class TestNodeTrait(object): + def test(self): + class 
MyClass(tl.HasTraits): + node = NodeTrait() + + t = MyClass(node=podpac.Node()) + + with pytest.raises(tl.TraitError): + MyClass(node=0) + + def test_debug(self): + class MyClass(tl.HasTraits): + node = NodeTrait() + + node = podpac.Node() + + with podpac.settings: + podpac.settings["DEBUG"] = False + t = MyClass(node=node) + assert t.node is node + + podpac.settings["DEBUG"] = True + t = MyClass(node=node) + assert t.node is not node + + +class TestTupleTrait(object): + def test_trait(self): + class MyClass(tl.HasTraits): + t = TupleTrait(trait=tl.Int()) + + MyClass(t=(1, 2, 3)) + + with pytest.raises(tl.TraitError): + MyClass(t=("a", "b", "c")) + + def test_tuple(self): + class MyClass(tl.HasTraits): + t = TupleTrait(trait=tl.Int()) + + a = MyClass(t=(1, 2, 3)) + assert isinstance(a.t, tuple) + + a = MyClass(t=[1, 2, 3]) + assert isinstance(a.t, tuple) + + +class TestJSONEncoder(object): + def test_coordinates(self): + coordinates = podpac.coordinates.Coordinates([0], dims=["time"]) + json.dumps(coordinates, cls=JSONEncoder) + + def test_node(self): + node = podpac.Node() + json.dumps(node, cls=JSONEncoder) + + def test_style(self): + style = podpac.core.style.Style() + json.dumps(style, cls=JSONEncoder) + + def test_interpolation(self): + interpolation = podpac.data.Interpolation() + json.dumps(interpolation, cls=JSONEncoder) + + def test_interpolator(self): + kls = podpac.data.INTERPOLATORS[0] + json.dumps(kls, cls=JSONEncoder) + + def test_units(self): + units = podpac.core.units.ureg.Unit("meters") + json.dumps(units, cls=JSONEncoder) + + def test_datetime64(self): + dt = np.datetime64() + json.dumps(dt, cls=JSONEncoder) + + def test_timedelta64(self): + td = np.timedelta64() + json.dumps(td, cls=JSONEncoder) + + def test_datetime(self): + now = datetime.datetime.now() + json.dumps(now, cls=JSONEncoder) + + def test_date(self): + today = datetime.date.today() + json.dumps(today, cls=JSONEncoder) + + def test_dataframe(self): + df = pd.DataFrame() + json.dumps(df, cls=JSONEncoder) + + def test_array_datetime64(self): + a = np.array(["2018-01-01", "2018-01-02"]).astype(np.datetime64) + json.dumps(a, cls=JSONEncoder) + + def test_array_timedelta64(self): + a = np.array([np.timedelta64(1, "D"), np.timedelta64(1, "D")]) + json.dumps(a, cls=JSONEncoder) + + def test_array_numerical(self): + a = np.array([0.0, 1.0, 2.0]) + json.dumps(a, cls=JSONEncoder) + + def test_array_node(self): + a = np.array([podpac.Node(), podpac.Node()]) + json.dumps(a, cls=JSONEncoder) + + def test_array_unserializable(self): + class MyClass(object): + pass + + a = np.array([MyClass()]) + with pytest.raises(TypeError, match="Cannot serialize numpy array"): + json.dumps(a, cls=JSONEncoder) + + def test_unserializable(self): + value = xr.DataArray([]) + with pytest.raises(TypeError, match="not JSON serializable"): + json.dumps(value, cls=JSONEncoder) + + def test_is_json_serializable(self): + assert is_json_serializable("test") + assert not is_json_serializable(xr.DataArray([])) + + +class TestCachedPropertyDecorator(object): + def test_cached_property(self): + class MyNode(podpac.Node): + my_property_called = 0 + my_cached_property_called = 0 + my_cache_ctrl_property_called = 0 + + @property + def my_property(self): + self.my_property_called += 1 + return 10 + + @cached_property + def my_cached_property(self): + self.my_cached_property_called += 1 + return 20 + + @cached_property(use_cache_ctrl=True) + def my_cache_ctrl_property(self): + self.my_cache_ctrl_property_called += 1 + return 30 + + a = 
MyNode(cache_ctrl=["ram"]) + b = MyNode(cache_ctrl=["ram"]) + c = MyNode(cache_ctrl=[]) + + a.rem_cache(key="*") + b.rem_cache(key="*") + c.rem_cache(key="*") + + # normal property should be called every time + assert a.my_property_called == 0 + assert a.my_property == 10 + assert a.my_property_called == 1 + assert a.my_property == 10 + assert a.my_property == 10 + assert a.my_property_called == 3 + + assert b.my_property_called == 0 + assert b.my_property == 10 + assert b.my_property_called == 1 + assert b.my_property == 10 + assert b.my_property == 10 + assert b.my_property_called == 3 + + assert c.my_property_called == 0 + assert c.my_property == 10 + assert c.my_property_called == 1 + assert c.my_property == 10 + assert c.my_property == 10 + assert c.my_property_called == 3 + + # cached property should only be called when it is accessed + assert a.my_cached_property_called == 0 + assert a.my_cached_property == 20 + assert a.my_cached_property_called == 1 + assert a.my_cached_property == 20 + assert a.my_cached_property == 20 + assert a.my_cached_property_called == 1 + + assert b.my_cached_property_called == 0 + assert b.my_cached_property == 20 + assert b.my_cached_property_called == 1 + assert b.my_cached_property == 20 + assert b.my_cached_property == 20 + assert b.my_cached_property_called == 1 + + assert c.my_cached_property_called == 0 + assert c.my_cached_property == 20 + assert c.my_cached_property_called == 1 + assert c.my_cached_property == 20 + assert c.my_cached_property == 20 + assert c.my_cached_property_called == 1 + + # cache_ctrl cached property should only be called in the first node that accessses it + assert a.my_cache_ctrl_property_called == 0 + assert a.my_cache_ctrl_property == 30 + assert a.my_cache_ctrl_property_called == 1 + assert a.my_cache_ctrl_property == 30 + assert a.my_cache_ctrl_property == 30 + assert a.my_cache_ctrl_property_called == 1 + + assert b.my_cache_ctrl_property_called == 0 + assert b.my_cache_ctrl_property == 30 + assert b.my_cache_ctrl_property_called == 0 + assert b.my_cache_ctrl_property == 30 + assert b.my_cache_ctrl_property == 30 + assert b.my_cache_ctrl_property_called == 0 + + # but only if a cache_ctrl exists for the Node + assert c.my_cache_ctrl_property_called == 0 + assert c.my_cache_ctrl_property == 30 + assert c.my_cache_ctrl_property_called == 1 + assert c.my_cache_ctrl_property == 30 + assert c.my_cache_ctrl_property == 30 + assert c.my_cache_ctrl_property_called == 1 + + def test_invalid_argument(self): + with pytest.raises(TypeError, match="cached_property decorator does not accept keyword argument"): + cached_property(other=True) + + with pytest.raises(TypeError, match="cached_property decorator does not accept any positional arguments"): + cached_property(True) + + +class TestInd2Slice(object): + def test_slice(self): + assert ind2slice((slice(1, 4),)) == (slice(1, 4),) + + def test_integer(self): + assert ind2slice((1,)) == (1,) + + def test_integer_array(self): + assert ind2slice(([1, 2, 4],)) == (slice(1, 5),) + + def test_boolean_array(self): + assert ind2slice(([False, True, True, False, True, False],)) == (slice(1, 5),) + + def test_stepped(self): + assert ind2slice(([1, 3, 5],)) == (slice(1, 7, 2),) + assert ind2slice(([False, True, False, True, False, True],)) == (slice(1, 7, 2),) + + def test_multiindex(self): + I = (slice(1, 4), 1, [1, 2, 4], [False, True, False], [1, 3, 5]) + assert ind2slice(I) == (slice(1, 4), 1, slice(1, 5), 1, slice(1, 7, 2)) + + def test_nontuple(self): + assert ind2slice(slice(1, 4)) == 
slice(1, 4) + assert ind2slice(1) == 1 + assert ind2slice([1, 2, 4]) == slice(1, 5) + assert ind2slice([False, True, True, False, True, False]) == slice(1, 5) + assert ind2slice([1, 3, 5]) == slice(1, 7, 2) diff --git a/podpac/core/units.py b/podpac/core/units.py index 7f5ff3712..e2242d919 100644 --- a/podpac/core/units.py +++ b/podpac/core/units.py @@ -185,14 +185,27 @@ def to_format(self, format, *args, **kwargs): elif format in ["pickle", "pkl"]: r = cPickle.dumps(self) elif format == "zarr_part": - if part in kwargs: - part = [slice(*sss) for sss in kwargs.pop("part")] + from podpac.core.data.zarr_source import Zarr + import zarr + + if "part" in kwargs: + part = kwargs.pop("part") + part = tuple([slice(*sss) for sss in part]) else: part = slice(None) - zf = zarr.open(*args, **kwargs) - zf[part] = self.data + zn = Zarr(source=kwargs.pop("source")) + store = zn._get_store() + + zf = zarr.open(store, *args, **kwargs) + if "output" in self.dims: + for key in self.coords["output"].data: + zf[key][part] = self.sel(output=key).data + else: + data_key = kwargs.get("data_key", "data") + zf[data_key][part] = self.data + r = zn.source else: try: getattr(self, "to_" + format)(*args, **kwargs) @@ -353,6 +366,8 @@ def open(cls, *args, **kwargs): # pass in kwargs to constructor uda_kwargs = {"attrs": da.attrs} + if "output" in da.dims: + uda_kwargs.update({"outputs": da.coords["output"]}) return cls.create(coords, data=da.data, **uda_kwargs) @classmethod @@ -479,16 +494,6 @@ def func(self, *args, **kwargs): del func -def create_dataarray(coords, data=np.nan, dtype=float, outputs=None, **kwargs): - """Deprecated. Use `UnitsDataArray.create()` in place. - """ - warnings.warn( - "The `create_dataarray` function is deprecated and will be removed in podpac 2.0. Use the classmethod `UnitsDataArray.create()` instead.", - DeprecationWarning, - ) - return UnitsDataArray.create(coords, data=data, outputs=outputs, dtype=dtype, **kwargs) - - def to_image(data, format="png", vmin=None, vmax=None, return_base64=False): """Return a base64-encoded image of data @@ -516,7 +521,9 @@ def to_image(data, format="png", vmin=None, vmax=None, return_base64=False): import matplotlib.cm from matplotlib.image import imsave - matplotlib.use("agg") + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + matplotlib.use("agg") if format != "png": raise ValueError("Invalid image format '%s', must be 'png'" % format) diff --git a/podpac/core/utils.py b/podpac/core/utils.py index bb0929ed3..a4031d4bc 100644 --- a/podpac/core/utils.py +++ b/podpac/core/utils.py @@ -10,19 +10,20 @@ import datetime import functools import importlib -from collections import OrderedDict import logging +from collections import OrderedDict from copy import deepcopy -from six import string_types -import lazy_import try: import urllib.parse as urllib except: # Python 2.7 import urlparse as urllib +from six import string_types +import lazy_import import traitlets as tl import numpy as np +import xarray as xr import pandas as pd # Core dependency of xarray # Optional Imports @@ -54,7 +55,7 @@ def _decorator(func): return _decorator -def trait_is_defined(obj, trait): +def trait_is_defined(obj, trait_name): """Utility method to determine if trait is defined on object without call to default (@tl.default) @@ -62,7 +63,7 @@ def trait_is_defined(obj, trait): ---------- object : object Class with traits - trait : str + trait_name : str Class property to investigate Returns @@ -71,7 +72,7 @@ def trait_is_defined(obj, trait): True if the trait exists on 
the object and is defined False if the trait does not exist on the object or the trait is not defined """ - return obj.has_trait(trait) and trait in obj._trait_values + return obj.has_trait(trait_name) and trait_name in obj._trait_values def create_logfile( @@ -165,13 +166,7 @@ def __init__(self, ndim=None, shape=None, dtype=None, dtypes=None, *args, **kwar def validate(self, obj, value): # coerce type if not isinstance(value, np.ndarray): - try: - value = np.array(value) - except: - raise tl.TraitError( - "The '%s' trait of an %s instance must be an np.ndarray, but a value of %s %s was specified" - % (self.name, obj.__class__.__name__, value, type(value)) - ) + value = np.array(value) # ndim if self.ndim is not None and self.ndim != value.ndim: @@ -208,9 +203,11 @@ def validate(self, obj, value): return tuple(value) -class NodeTrait(tl.ForwardDeclaredInstance): +class NodeTrait(tl.Instance): def __init__(self, *args, **kwargs): - super(NodeTrait, self).__init__("Node", *args, **kwargs) + from podpac import Node as _Node + + super(NodeTrait, self).__init__(_Node, *args, **kwargs) def validate(self, obj, value): super(NodeTrait, self).validate(obj, value) @@ -221,61 +218,50 @@ def validate(self, obj, value): class JSONEncoder(json.JSONEncoder): def default(self, obj): - # podpac Coordinates objects - if isinstance(obj, podpac.Coordinates): - return obj.definition - - # podpac Node objects - elif isinstance(obj, podpac.Node): - return obj.definition - - # podpac Style objects - elif isinstance(obj, podpac.core.style.Style): + # podpac objects with definitions + if isinstance(obj, (podpac.Coordinates, podpac.Node, podpac.data.Interpolation, podpac.core.style.Style)): return obj.definition - elif isinstance(obj, podpac.data.Interpolation): - return obj.definition + # podpac Interpolator type + if isinstance(obj, type) and obj in podpac.data.INTERPOLATORS: + return obj().definition # pint Units - elif isinstance(obj, podpac.core.units.ureg.Unit): + if isinstance(obj, podpac.core.units.ureg.Unit): return str(obj) - # numpy arrays - elif isinstance(obj, np.ndarray): - if np.issubdtype(obj.dtype, np.datetime64): - return obj.astype(str).tolist() - elif np.issubdtype(obj.dtype, np.timedelta64): - f = np.vectorize(podpac.core.coordinates.utils.make_timedelta_string) - return f(obj).tolist() - elif np.issubdtype(obj.dtype, np.number): - return obj.tolist() - # datetime64 - elif isinstance(obj, np.datetime64): + if isinstance(obj, np.datetime64): return obj.astype(str) # timedelta64 - elif isinstance(obj, np.timedelta64): + if isinstance(obj, np.timedelta64): return podpac.core.coordinates.utils.make_timedelta_string(obj) # datetime - elif isinstance(obj, datetime.datetime): + if isinstance(obj, (datetime.datetime, datetime.date)): return obj.isoformat() # dataframe - elif isinstance(obj, pd.DataFrame): + if isinstance(obj, pd.DataFrame): return obj.to_json() - # Interpolator - try: - if obj in podpac.core.data.interpolation.INTERPOLATORS: - interpolater_class = deepcopy(obj) - interpolator = interpolater_class() - return interpolator.definition - except Exception as e: - pass - - # default + # numpy array + if isinstance(obj, np.ndarray): + if np.issubdtype(obj.dtype, np.datetime64): + return obj.astype(str).tolist() + if np.issubdtype(obj.dtype, np.timedelta64): + return [podpac.core.coordinates.utils.make_timedelta_string(e) for e in obj] + if np.issubdtype(obj.dtype, np.number): + return obj.tolist() + else: + try: + # completely serialize the individual elements using the custom encoder + return 
json.loads(json.dumps([e for e in obj], cls=JSONEncoder)) + except TypeError as e: + raise TypeError("Cannot serialize numpy array\n%s" % e) + + # raise the TypeError return json.JSONEncoder.default(self, obj) @@ -306,18 +292,28 @@ def _get_query_params_from_url(url): return params -def _get_from_url(url): +def _get_from_url(url, session=None): """Helper function to get data from an url with error checking. - + Parameters - ----------- - auth_session: podpac.core.authentication.EarthDataSession - Authenticated EDS session - url: str + ---------- + url : str URL to website + session : :class:`requests.Session`, optional + Requests session to use when making the GET request to `url` + + Returns + ------- + str + Text response from request. + See https://2.python-requests.org/en/master/api/#requests.Response.text """ try: - r = requests.get(url) + if session is None: + r = requests.get(url) + else: + r = session.get(url) + if r.status_code != 200: _log.warning( "Could not connect to {}, status code {}. \n *** Return Text *** \n {} \n *** End Return Text ***".format( @@ -330,4 +326,125 @@ def _get_from_url(url): r = None except RuntimeError as e: _log.warning("Cannot authenticate to {}. Check credentials. Error was as follows:".format(url) + str(e)) - return r.text + + return r + + +def cached_property(*args, **kwargs): + """ + Decorator that creates a property that is cached. + + Keyword Arguments + ----------------- + use_cache_ctrl : bool + If True, the property is cached using the Node cache_ctrl. If False, the property is only cached as a private + attribute. Default False. + + Notes + ----- + Podpac caching using the cache_ctrl will be unreliable if the property depends on any non-tagged traits. + The property should only use node attrs (traits tagged with ``attr=True``). + + Examples + -------- + + >>> class MyNode(Node): + # property that is recomputed every time + @property + def my_property(self): + return 0 + + # property is computed once for each object + @cached_property + def my_cached_property(self): + return 1 + + # property that is computed once and can be reused by other Nodes or sessions, depending on the cache_ctrl + @cached_property(use_cache_ctrl=True) + def my_persistent_cached_property(self): + return 2 + """ + + use_cache_ctrl = kwargs.pop("use_cache_ctrl", False) + + if args and (len(args) != 1 or not callable(args[0])): + raise TypeError("cached_property decorator does not accept any positional arguments") + + if kwargs: + raise TypeError("cached_property decorator does not accept keyword argument '%s'" % list(kwargs.keys())[0]) + + def d(fn): + key = "_podpac_cached_property_%s" % fn.__name__ + + @property + def wrapper(self): + if hasattr(self, key): + value = getattr(self, key) + elif use_cache_ctrl and self.has_cache(key): + value = self.get_cache(key) + setattr(self, key, value) + else: + value = fn(self) + setattr(self, key, value) + if use_cache_ctrl: + self.put_cache(value, key) + return value + + return wrapper + + if args: + return d(args[0]) + else: + return d + + +def ind2slice(Is): + """ Convert boolean and integer index arrays to slices. + + Integer and boolean arrays are converted to slices that span the selected elements, but may include additional + elements. If possible, the slices are stepped. 
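The `cached_property` decorator added to `podpac/core/utils.py` above memoizes a property on a private instance attribute, and can optionally persist the value through the Node cache_ctrl. A minimal usage sketch of the default per-instance behavior, assuming only that podpac is importable; the `Counter` class is hypothetical and exists purely for illustration:

```python
from podpac.core.utils import cached_property


class Counter(object):
    """Hypothetical class used only to illustrate the decorator."""

    def __init__(self):
        self.n_calls = 0

    @cached_property
    def expensive(self):
        # computed on first access, then stored on a private instance attribute
        self.n_calls += 1
        return 42


c = Counter()
assert c.expensive == 42
assert c.expensive == 42
assert c.n_calls == 1  # the second access is served from the per-instance cache
```

The `cached_property(use_cache_ctrl=True)` form additionally calls `has_cache`/`get_cache`/`put_cache`, so that variant is only meaningful on Node subclasses with a cache_ctrl configured.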
+ + Arguments + --------- + Is : tuple + tuple of indices (slice, integer array, boolean array, or single integer) + + Returns + ------- + Js : tuple + tuple of slices + """ + + if isinstance(Is, tuple): + return tuple(_ind2slice(I) for I in Is) + else: + return _ind2slice(Is) + + +def _ind2slice(I): + # already a slice + if isinstance(I, slice): + return I + + # convert to numpy array + I = np.atleast_1d(I) + + # convert boolean array to index array + if I.dtype == bool: + (I,) = np.where(I) + + # empty slice + if I.size == 0: + return slice(0, 0) + + # singleton + if I.size == 1: + return I[0] + + # stepped slice + diff = np.diff(I) + if diff.size and np.all(diff == diff[0]) and diff[0] != 0: + return slice(I.min(), I.max() + diff[0], diff[0]) + + # non-stepped slice + return slice(I.min(), I.max() + 1) diff --git a/podpac/data.py b/podpac/data.py index eec916743..df55d1270 100644 --- a/podpac/data.py +++ b/podpac/data.py @@ -5,18 +5,23 @@ # REMINDER: update api docs (doc/source/user/api.rst) to reflect changes to this file from podpac.core.data.datasource import DataSource -from podpac.core.data.interpolation import ( +from podpac.core.data.array_source import Array +from podpac.core.data.pydap_source import PyDAP +from podpac.core.data.rasterio_source import Rasterio +from podpac.core.data.h5py_source import H5PY +from podpac.core.data.csv_source import CSV +from podpac.core.data.dataset_source import Dataset +from podpac.core.data.zarr_source import Zarr +from podpac.core.data.ogc import WCS +from podpac.core.data.reprojection import ReprojectedSource + +from podpac.core.interpolation.interpolation import ( Interpolation, InterpolationException, - interpolation_trait, + InterpolationTrait, INTERPOLATION_DEFAULT, INTERPOLATORS, INTERPOLATION_METHODS, INTERPOLATORS_DICT, INTERPOLATION_METHODS_DICT, ) -from podpac.core.data.array_source import Array -from podpac.core.data.pydap_source import PyDAP -from podpac.core.data.file import Rasterio, H5PY, CSV, Dataset, Zarr -from podpac.core.data.ogc import WCS -from podpac.core.data.reprojection import ReprojectedSource diff --git a/podpac/datalib/__init__.py b/podpac/datalib/__init__.py index 31faf1075..7fa282dbd 100644 --- a/podpac/datalib/__init__.py +++ b/podpac/datalib/__init__.py @@ -25,4 +25,4 @@ # intake requires python >= 3.6 if sys.version >= "3.6": - from podpac.datalib.intake import IntakeCatalog + from podpac.datalib.intake_catalog import IntakeCatalog diff --git a/podpac/datalib/airmoss.py b/podpac/datalib/airmoss.py deleted file mode 100644 index 651f9f771..000000000 --- a/podpac/datalib/airmoss.py +++ /dev/null @@ -1,217 +0,0 @@ -""" -Airmoss summary -""" - -from __future__ import division, unicode_literals, print_function, absolute_import - -import re -from collections import OrderedDict - -import requests -from bs4 import BeautifulSoup -import numpy as np -import traitlets as tl - -# Internal dependencies -import podpac -from podpac.core.data import types as datatype - - -class AirMOSS_Source(datatype.PyDAP): - """Summary - - Attributes - ---------- - datakey : TYPE - Description - date_url_re : TYPE - Description - nan_vals : list - Description - product : TYPE - Description - """ - - product = tl.Enum(["L4RZSM"], default_value="L4RZSM") - date_url_re = re.compile("[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}") - datakey = tl.Unicode("sm1") - nan_vals = [-9999.0] - - def get_native_coordinates(self): - """Summary - - Returns - ------- - TYPE - Description - """ - try: - return 
self.load_cached_obj("native.coordinates") - except: - pass - - ds = self.dataset - base_date = ds["time"].attributes["units"] - base_date = self.date_url_re.search(base_date).group() - times = (ds["time"][:]).astype("timedelta64[h]") + np.array(base_date, "datetime64") - - lons = podpac.crange(ds["lon"][0], ds["lon"][-1], ds["lon"][1] - ds["lon"][0]) - lats = podpac.crange(ds["lat"][0], ds["lat"][-1], ds["lat"][1] - ds["lat"][0]) - coords = podpac.Coordinates([times, lats, lons], dims=["time", "lat", "lon"]) - self.cache_obj(coords, "native.coordinates") - - return coords - - def get_data(self, coordinates, coordinates_index): - """Summary - - Parameters - ---------- - coordinates : TYPE - Description - coordinates_index : TYPE - Description - - Returns - ------- - TYPE - Description - """ - data = self.dataset[self.datakey].array[tuple(coordinates_index)] - d = self.create_output_array(coordinates, data=data.reshape(coordinates.shape)) - return d - - -class AirMOSS_Site(podpac.OrderedCompositor): - """Summary - - Attributes - ---------- - base_dir_url : TYPE - Description - base_url : TYPE - Description - date_url_re : TYPE - Description - product : TYPE - Description - site : TYPE - Description - """ - - product = tl.Enum(["L4RZSM"], default_value="L4RZSM") - base_url = tl.Unicode("https://thredds.daac.ornl.gov/thredds/dodsC/ornldaac/1421") - base_dir_url = tl.Unicode("https://thredds.daac.ornl.gov/thredds/catalog/ornldaac/1421/catalog.html") - site = tl.Unicode("") - date_url_re = re.compile("[0-9]{8}") - - def get_native_coordinates(self): - """Summary - - Returns - ------- - TYPE - Description - """ - try: - return self.load_cached_obj("native.coordinates") - except: - pass - - ds = self.dataset - times = self.get_available_dates() - lons = podpac.crange(ds["lon"][0], ds["lon"][-1], ds["lon"][1] - ds["lon"][0]) - lats = podpac.crange(ds["lat"][0], ds["lat"][-1], ds["lat"][1] - ds["lat"][0]) - coords = podpac.Coordinates([times, lats, lons], dims=["time", "lat", "lon"]) - self.cache_obj(coords, "native.coordinates") - - return coords - - def get_available_dates(self): - """Summary - - Returns - ------- - TYPE - Description - """ - soup = BeautifulSoup(requests.get(self.base_dir_url).text, "lxml") - a = soup.find_all("a") - regex = self.date_url_re - - times = [] - for aa in a: - text = aa.get_text() - if self.site in text: - m = regex.search(text) - if m: - t = m.group() - times.append(np.datetime64("-".join([t[:4], t[4:6], t[6:]]))) - times.sort() - return np.array(times) - - -class AirMOSS(podpac.OrderedCompositor): - """Summary - - Attributes - ---------- - product : TYPE - Description - site_url_re : TYPE - Description - """ - - product = tl.Enum(["L4RZSM"], default_value="L4RZSM") - site_url_re = tl.Any() - - @tl.default("site_url_re") - def get_site_url_re(self): - """Summary - - Returns - ------- - TYPE - Description - """ - return re.compile(self.product + "_.*_" + "[0-9]{8}.*\.nc4") - - def get_available_sites(self): - """Summary - - Returns - ------- - TYPE - Description - """ - soup = BeautifulSoup(requests.get(self.base_dir_url).text, "lxml") - a = soup.find_all("a") - regex = self.site_url_re - - sites = OrderedDict() - for aa in a: - text = aa.get_text() - m = regex.match(text) - if m: - site = text.split("_")[1] - sites[site] = 1 + sites.get(site, 0) - - return sites - - -if __name__ == "__main__": - ams = AirMOSS_Site(interpolation="nearest_preview", site="BermsP") - print(ams.native_coordinates) - - source = 
"https://thredds.daac.ornl.gov/thredds/dodsC/ornldaac/1421/L4RZSM_BermsP_20121025_v5.nc4" - am = AirMOSS_Source(source=source, interpolation="nearest_preview") - coords = am.native_coordinates - print(coords) - print(coords["time"].area_bounds) - - lat, lon = am.native_coordinates.coords["lat"], am.native_coordinates.coords["lon"] - lat = lat[::10][np.isfinite(lat[::10])] - lon = lon[::10][np.isfinite(lon[::10])] - coords = podpac.Coordinates([lat, lon], dims=["lat", "lon"]) - o = am.eval(coords) - print("Done") diff --git a/podpac/datalib/cosmos_stations.py b/podpac/datalib/cosmos_stations.py new file mode 100644 index 000000000..ad1b68b17 --- /dev/null +++ b/podpac/datalib/cosmos_stations.py @@ -0,0 +1,402 @@ +from __future__ import division, unicode_literals, print_function, absolute_import + +from six import string_types +import traitlets as tl +import re +import numpy as np +from dateutil import parser +import requests +import json + +try: + import cPickle # Python 2.7 +except: + import _pickle as cPickle +from io import StringIO + +from podpac.core.utils import _get_from_url + +# Optional dependencies +from lazy_import import lazy_module + +bs4 = lazy_module("bs4") + +import podpac +from podpac.core.utils import cached_property + + +def _convert_str_to_vals(properties): + IGNORE_KEYS = ["sitenumber"] + for k, v in properties.items(): + if not isinstance(v, string_types) or k in IGNORE_KEYS: + continue + try: + if "," in v: + properties[k] = tuple([float(vv) for vv in v.split(",")]) + else: + properties[k] = float(v) + except ValueError: + try: + properties[k] = np.datetime64(v) + except ValueError: + pass + return properties + + +class COSMOSStation(podpac.data.DataSource): + _repr_keys = ["label", "network", "location"] + + url = tl.Unicode("http://cosmos.hwr.arizona.edu/Probes/StationDat/") + station_data = tl.Dict().tag(attr=True) + + @tl.default("interpolation") + def _interpolation_default(self): + return {"method": "nearest", "params": {"spatial_tolerance": 1.1, "time_tolerance": np.timedelta64(1, "D")}} + + @cached_property + def raw_data(self): + r = requests.get(self.station_data_url) + return r.text + + @cached_property + def data_columns(self): + return self.raw_data.split("\n", 1)[0].split(" ") + + @property + def site_number(self): + return str(self.station_data["sitenumber"]) + + @property + def station_data_url(self): + return self.url + self.site_number + "/smcounts.txt" + + @property + def station_calibration_url(self): + return self.url + self.site_number + "/calibrationInfo.php" + + @property + def station_properties_url(self): + return self.url + self.site_number + "/index.php" + + def get_data(self, coordinates, coordinates_index): + data = np.loadtxt(StringIO(self.raw_data), skiprows=1, usecols=self.data_columns.index("SOILM"))[ + coordinates_index[0] + ] + data[data > 100] = np.nan + data[data < 0] = np.nan + data /= 100.0 # Make it fractional + return self.create_output_array(coordinates, data=data[:, None, None]) + + def get_coordinates(self): + lat_lon = self.station_data["location"] + time = np.loadtxt( + StringIO(self.raw_data), + skiprows=1, + usecols=[self.data_columns.index("YYYY-MM-DD"), self.data_columns.index("HH:MM")], + dtype=str, + ) + time = np.array([t[0] + "T" + t[1] for t in time], np.datetime64) + c = podpac.Coordinates([time, lat_lon[0], lat_lon[1]], ["time", "lat", "lon"]) + return c + + @property + def label(self): + return self.station_data["label"] + + @property + def network(self): + return self.station_data["network"] + + @property + 
def location(self): + return self.station_data["location"] + + @cached_property(use_cache_ctrl=True) + def calibration_data(self): + cd = _get_from_url(self.station_calibration_url).json() + cd["items"] = [_convert_str_to_vals(i) for i in cd["items"]] + return cd + + @cached_property(use_cache_ctrl=True) + def site_properties(self): + r = _get_from_url(self.station_properties_url) + soup = bs4.BeautifulSoup(r.text, "lxml") + regex = re.compile("Soil Organic Carbon") + loc = soup.body.findAll(text=regex)[0].parent.parent + label, value = loc.findAll("div") + labels = [l.strip() for l in label.children if "br" not in str(l)] + values = [l.strip() for l in value.children if "br" not in str(l) and l.strip() != ""] + + properties = {k: v for k, v in zip(labels, values)} + + return _convert_str_to_vals(properties) + + +class COSMOSStations(podpac.compositor.OrderedCompositor): + url = tl.Unicode("http://cosmos.hwr.arizona.edu/Probes/") + stations_url = tl.Unicode("sitesNoLegend.js") + + ## PROPERTIES + @cached_property(use_cache_ctrl=True) + def _stations_data_raw(self): + url = self.url + self.stations_url + r = _get_from_url(url) + t = r.text + + # Fix the JSON + t_f = re.sub(':\s?",', ': "",', t) # Missing closing parenthesis + if t_f[-5:] == ",\n]}\n": # errant comma + t_f = t_f[:-5] + "\n]}\n" + + return t_f + + @cached_property + def stations_data(self): + stations = json.loads(self._stations_data_raw) + stations["items"] = [_convert_str_to_vals(i) for i in stations["items"]] + return stations + + @cached_property(use_cache_ctrl=True) + def source_coordinates(self): + lat_lon = np.array(self.stations_value("location")) + c = podpac.Coordinates([[lat_lon[:, 0], lat_lon[:, 1]]], ["lat_lon"]) + return c + + @cached_property + def sources(self): + return np.array([COSMOSStation(station_data=item) for item in self.stations_data["items"]]) + + @property + def available_data_keys(self): + return list(self.stations_data["items"][0].keys()) + + ## UTILITY FUNCTIONS + def stations_value(self, key, stations_data=None): + """ Returns a list of values for all the station for a particular key + + Parameters + ----------- + key: str + Key describing the station data. See self.available_data_keys for available keys. + + Returns + -------- + list + A list of the values for the keys for each station + """ + if key not in self.available_data_keys: + raise ValueError("Input key {} is not in available keys {}".format(key, self.available_data_keys)) + + return self._stations_value(key, stations_data) + + def _stations_value(self, key, stations_data=None): + """ helper function for stations_value + """ + if stations_data is None: + stations_data = self.stations_data + + return [i[key] for i in stations_data["items"]] + + @property + def stations_label(self): + return self.stations_value("label") + + def label_from_latlon(self, lat_lon): + """ Returns the COSMOS station's label given it's lat/lon coordinates + + Parameters + ----------- + lat_lon : podpac.Coordinates + The lat/lon locations whose station name will be returned. Note, the lat/lon coordinates have to match + exactly the coordinates given in station_data[N]['location'], where N is the station. + This should be Coordinates object with 'lat_lon' stacked coordinates as one of the dimensions. + + Returns + -------- + list + List of COSMOS station names corresponding to the given coordinates. If a coordinate has no match, then + "None" is returned. 
+ """ + if "lon_lat" in lat_lon.dims: + lat_lon = lat_lon.transpose("lon_lat") + elif "lat_lon" not in lat_lon.dims: + raise ValueError("The coordinates object must have a stacked 'lat_lon' dimension.") + + labels_map = {s["location"]: s["label"] for s in self.stations_data["items"]} + labels = [labels_map.get(ll, None) for ll in lat_lon.coords["lat_lon"]] + return labels + + def latlon_from_label(self, label): + """ Returns the lat/lon coordinates of COSMOS stations that match the given labels + + Parameters + ------------ + label: str, list + Strings that partially describe a COSMOS station label. + + Returns + -------- + podpac.Coordinates + The coordinates of the COSMOS stations matching the input data + """ + if not isinstance(label, list): + label = [label] + + ind = self._get_label_inds(label) + if ind.size == 0: + return podpac.Coordinates([]) # Empty + + return self.source_coordinates[ind] + + def _get_label_inds(self, label): + """ Helper function to get source indices for partially matched labels """ + ind = [] + for lab in label: + ind.extend([i for i, l in enumerate(self.stations_label) if lab.lower() in l.lower()]) + + ind = np.unique(ind) + return ind + + def get_calibration_data(self, label=None, lat_lon=None): + """ Returns the calibration information for a station. Users must supply a label or lat_lon coordinates. + + Parameters + ------------ + label: str, List (optional) + Labels describing the station. + + lat_lon: podpac.Coordinates (optional) + Coordinates of the COSMOS station. Note, this object has to have a 'lat_lon' dimension which matches exactly + with the COSMOS stations. + + Returns + -------- + list + A list of dictionaries containing the calibration data for the requested stations. + """ + + if label is None and lat_lon is None: + raise ValueError("Must supply either 'label' or 'lat_lon'") + + if lat_lon is not None: + label = self.label_from_latlon(lat_lon) + + if isinstance(label, string_types): + label = [label] + + inds = self._get_label_inds(label) + + return [self.sources[i].calibration_data for i in inds] + + def get_site_properties(self, label=None, lat_lon=None): + """ Returns the site properties for a station. Users must supply a label or lat_lon coordinates. + + Parameters + ------------ + label: str, List (optional) + Labels describing the station. + + lat_lon: podpac.Coordinates (optional) + Coordinates of the COSMOS station. Note, this object has to have a 'lat_lon' dimension which matches exactly + with the COSMOS stations. + + Returns + -------- + list + A list of dictionaries containing the properties for the requested stations. + """ + + if label is None and lat_lon is None: + raise ValueError("Must supply either 'label' or 'lat_lon'") + + if lat_lon is not None: + label = self.label_from_latlon(lat_lon) + + if isinstance(label, string_types): + label = [label] + + inds = self._get_label_inds(label) + + return [self.sources[i].site_properties for i in inds] + + def get_station_data(self, label=None, lat_lon=None): + """ Returns the station data. Users must supply a label or lat_lon coordinates. + + Parameters + ------------ + label: str, List (optional) + Labels describing the station. + + lat_lon: podpac.Coordinates (optional) + Coordinates of the COSMOS station. Note, this object has to have a 'lat_lon' dimension which matches exactly + with the COSMOS stations. + + Returns + -------- + list + A list of dictionaries containing the data for the requested stations. 
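A short usage sketch for the `COSMOSStations` helpers defined above (illustrative only: it needs network access to the COSMOS web service, and "Manitou" is just an example of a partial station label):

```python
import podpac
from podpac.datalib.cosmos_stations import COSMOSStations

cs = COSMOSStations(cache_ctrl=["ram"])

print(cs.available_data_keys)              # keys present in each station's metadata
manitou = cs.latlon_from_label("Manitou")  # Coordinates with a stacked 'lat_lon' dimension
print(cs.label_from_latlon(manitou))       # round-trips back to the matching label(s)
print(cs.get_site_properties("Manitou"))   # site property dicts for matching stations
print(cs.get_calibration_data("Manitou"))  # calibration info for matching stations
```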
+ """ + + if label is None and lat_lon is None: + raise ValueError("Must supply either 'label' or 'lat_lon'") + + if lat_lon is not None: + label = self.label_from_latlon(lat_lon) + + if isinstance(label, string_types): + label = [label] + + inds = self._get_label_inds(label) + + return [self.stations_data["items"][i] for i in inds] + + +if __name__ == "__main__": + bounds = {"lat": [40, 46], "lon": [-78, -68]} + cs = COSMOSStations(cache_ctrl=["ram", "disk"]) + + sd = cs.stations_data + ci = cs.source_coordinates.select(bounds) + ce = podpac.coordinates.merge_dims( + [podpac.Coordinates([podpac.crange("2018-05-01", "2018-06-01", "1,D", "time")]), ci] + ) + o = cs.eval(ce) + + # Test helper functions + labels = cs.stations_label + lat_lon = cs.latlon_from_label("Manitou") + labels = cs.label_from_latlon(lat_lon) + lat_lon2 = cs.latlon_from_label("No Match Here") + cal = cs.get_calibration_data("Manitou") + props = cs.get_site_properties("Manitou") + + from matplotlib import rcParams + + rcParams["axes.labelsize"] = 12 + rcParams["xtick.labelsize"] = 10 + rcParams["ytick.labelsize"] = 10 + rcParams["legend.fontsize"] = 8 + rcParams["lines.linewidth"] = 2 + rcParams["font.size"] = 12 + + import matplotlib.pyplot as plt + import matplotlib.dates as mdates + from pandas.plotting import register_matplotlib_converters + + register_matplotlib_converters() + + fig = plt.figure(figsize=(6.5, 3), dpi=300) + plt.plot(o.time, o.data, "o-") + ax = plt.gca() + plt.ylim(0, 1) + plt.legend(cs.label_from_latlon(ce)) + plt.ylabel("Soil Moisture ($m^3/m^3$)") + plt.xlabel("Date") + # plt.xticks(rotation=90) + fig.autofmt_xdate() + ax.fmt_xdata = mdates.DateFormatter("%m-%d") + plt.title("COSMOS Data for 2018 over lat (40, 46) by lon (-78,-68)") + plt.tight_layout() + plt.show() + + print("Done") diff --git a/podpac/datalib/drought_monitor.py b/podpac/datalib/drought_monitor.py index 9b94581fc..6c1d00f3b 100644 --- a/podpac/datalib/drought_monitor.py +++ b/podpac/datalib/drought_monitor.py @@ -1,9 +1,7 @@ -from podpac.core.node import Node -from podpac.core.style import Style -from podpac.core.utils import NodeTrait -from podpac.core.algorithm.algorithm import Algorithm -from podpac.core.data.file import Zarr -from podpac.core.coordinates import ArrayCoordinates1d +from podpac.algorithm import Algorithm +from podpac.data import Zarr +from podpac.style import Style +from podpac.utils import NodeTrait def drought_style(): @@ -25,7 +23,6 @@ def sm_style(): class DroughtMonitorCategory(Zarr): - # dims = ["lat", "lon", "time"] cf_time = True cf_units = "days since 2018-01-01 00:00:00" cf_calendar = "proleptic_gregorian" @@ -33,12 +30,12 @@ class DroughtMonitorCategory(Zarr): class DroughtCategory(Algorithm): - soil_moisture = NodeTrait() - d0 = NodeTrait() - d1 = NodeTrait() - d2 = NodeTrait() - d3 = NodeTrait() - d4 = NodeTrait() + soil_moisture = NodeTrait().tag(attr=True) + d0 = NodeTrait().tag(attr=True) + d1 = NodeTrait().tag(attr=True) + d2 = NodeTrait().tag(attr=True) + d3 = NodeTrait().tag(attr=True) + d4 = NodeTrait().tag(attr=True) style = drought_style() def algorithm(self, inputs): @@ -61,29 +58,55 @@ def algorithm(self, inputs): if __name__ == "__main__": + import os + import numpy as np import podpac c = podpac.Coordinates([46.6, -123.5, "2018-06-01"], dims=["lat", "lon", "time"]) # local path = "droughtmonitor/beta_parameters.zarr" - d0 = DroughtMonitorCategory(source=path, datakey="d0") - print(d0.native_coordinates) - print(d0.eval(c)) + if not os.path.exists(path): + print("No local drought 
monitor data found at '%s'" % path) + else: + # drought monitor parameters + d0 = DroughtMonitorCategory(source=path, data_key="d0") + print(d0.coordinates) + print(d0.eval(c)) + + # drought category + mock_sm = podpac.data.Array(data=np.random.random(d0.coordinates.shape), coordinates=d0.coordinates) + + category = DroughtCategory( + soil_moisture=mock_sm, + d0=DroughtMonitorCategory(source=path, data_key="d0"), + d1=DroughtMonitorCategory(source=path, data_key="d1"), + d2=DroughtMonitorCategory(source=path, data_key="d2"), + d3=DroughtMonitorCategory(source=path, data_key="d3"), + d4=DroughtMonitorCategory(source=path, data_key="d4"), + ) + print(category.eval(c)) # s3 bucket = "podpac-internal-test" store = "drought_parameters.zarr" path = "s3://%s/%s" % (bucket, store) - d0 = DroughtMonitorCategory(source=path, datakey="d0") - print(d0.native_coordinates) - print(d0.eval(c)) - - # the Zarr node uses the podpac AWS settings by default, but credentials can be explicitly provided, too - d0 = DroughtMonitorCategory( - source=path, - datakey="d0", - access_key_id=podpac.settings["AWS_ACCESS_KEY_ID"], - secret_access_key=podpac.settings["AWS_SECRET_ACCESS_KEY"], - region_name=podpac.settings["AWS_REGION_NAME"], - ) + d0 = DroughtMonitorCategory(source=path, data_key="d0") + if not d0.s3.exists(path): + print("No drought monitor data found at '%s'. Check your AWS credentials." % path) + else: + print(d0.coordinates) + print(d0.eval(c)) + + # drought category algorithm + mock_sm = podpac.data.Array(data=np.random.random(d0.coordinates.shape), coordinates=d0.coordinates) + + category = DroughtCategory( + soil_moisture=mock_sm, + d0=DroughtMonitorCategory(source=path, data_key="d0"), + d1=DroughtMonitorCategory(source=path, data_key="d1"), + d2=DroughtMonitorCategory(source=path, data_key="d2"), + d3=DroughtMonitorCategory(source=path, data_key="d3"), + d4=DroughtMonitorCategory(source=path, data_key="d4"), + ) + print(category.eval(c)) diff --git a/podpac/datalib/egi.py b/podpac/datalib/egi.py index f6497109b..5bc086160 100644 --- a/podpac/datalib/egi.py +++ b/podpac/datalib/egi.py @@ -29,6 +29,7 @@ from podpac.data import DataSource from podpac import authentication from podpac import settings +from podpac import cached_property from podpac.core.units import UnitsDataArray from podpac.core.node import node_eval @@ -84,10 +85,10 @@ class EGI(DataSource): If this setting is not defined, the node will attempt to generate a token using :attr:`self.username` and :attr:`self.password` username : str, optional - EarthData username (https://urs.earthdata.nasa.gov/) + Earthdata username (https://urs.earthdata.nasa.gov/) If undefined, node will look for a username under setting key "username@urs.earthdata.nasa.gov" password : str, optional - EarthData password (https://urs.earthdata.nasa.gov/) + Earthdata password (https://urs.earthdata.nasa.gov/) If undefined, node will look for a password under setting key "password@urs.earthdata.nasa.gov" Attributes @@ -96,11 +97,7 @@ class EGI(DataSource): The data array compiled from downloaded EGI data """ - base_url = tl.Unicode().tag(attr=True) - - @tl.default("base_url") - def _base_url_default(self): - return BASE_URL + base_url = tl.Unicode(default_value=BASE_URL).tag(attr=True) # required short_name = tl.Unicode().tag(attr=True) @@ -169,7 +166,7 @@ def coverage(self): data = tl.Any(allow_none=True) _url = tl.Unicode(allow_none=True) - @property + @cached_property def source(self): """ URL Endpoint built from input parameters @@ -201,13 +198,13 @@ def 
_append(u, key, val): # other parameters are included at eval time return url - def get_native_coordinates(self): - if self.data is not None: - return Coordinates.from_xarray(self.data.coords, crs=self.data.attrs["crs"]) - else: + def get_coordinates(self): + if self.data is None: _log.warning("No coordinates found in EGI source") return Coordinates([], dims=[]) + return Coordinates.from_xarray(self.data.coords, crs=self.data.attrs["crs"]) + def get_data(self, coordinates, coordinates_index): if self.data is not None: da = self.data[coordinates_index] @@ -229,8 +226,6 @@ def eval(self, coordinates, output=None): " which case EGI is returning no data." ) raise e - # Force update on native_coordinates (in case of multiple evals) - self.set_trait("native_coordinates", self.get_native_coordinates()) # run normal eval once self.data is prepared return super(EGI, self).eval(coordinates, output) @@ -313,6 +308,7 @@ def _download(self, coordinates): zipfile.ZipFile Returns zip file byte-str to downloaded data """ + # Ensure Coordinates are in decimal lat-lon coordinates = coordinates.transform("epsg:4326") self._authenticate() @@ -499,7 +495,7 @@ def token_valid(self): def get_token(self): """ - Get token for EGI interface using EarthData credentials + Get token for EGI interface using Earthdata credentials Returns ------- @@ -509,7 +505,7 @@ def get_token(self): Raises ------ ValueError - Raised if EarthData username or password is unavailable + Raised if Earthdata username or password is unavailable """ # token access URL url = "https://cmr.earthdata.nasa.gov/legacy-services/rest/tokens" @@ -517,12 +513,12 @@ def get_token(self): if self.username is not None: settings["username@EGI"] = self.username else: - raise ValueError("No EarthData username available to request EGI token") + raise ValueError("No Earthdata username available to request EGI token") if self.password is not None: settings["password@EGI"] = self.password else: - raise ValueError("No EarthData password available to request EGI token") + raise ValueError("No Earthdata password available to request EGI token") _ip = self._get_ip() request = """ diff --git a/podpac/datalib/gfs.py b/podpac/datalib/gfs.py index 964770126..b65147a47 100644 --- a/podpac/datalib/gfs.py +++ b/podpac/datalib/gfs.py @@ -1,138 +1,152 @@ from __future__ import division, unicode_literals, print_function, absolute_import -import logging import datetime import traitlets as tl import numpy as np -# Helper utility for optional imports from lazy_import import lazy_module -# Optional Imports -rasterio = lazy_module("rasterio") -boto3 = lazy_module("boto3") -botocore = lazy_module("botocore") +s3fs = lazy_module("s3fs") # Internal imports from podpac.data import DataSource, Rasterio from podpac.coordinates import Coordinates, merge_dims +from podpac.utils import cached_property, DiskCacheMixin +from podpac.core.authentication import S3Mixin BUCKET = "noaa-gfs-pds" -s3 = boto3.resource("s3") -s3.meta.client.meta.events.register("choose-signer.s3.*", botocore.handlers.disable_signing) -bucket = s3.Bucket(BUCKET) -# TODO add time to native_coordinates -class GFSSource(Rasterio): +class GFSSource(DiskCacheMixin, Rasterio): parameter = tl.Unicode().tag(attr=True) level = tl.Unicode().tag(attr=True) date = tl.Unicode().tag(attr=True) hour = tl.Unicode().tag(attr=True) forecast = tl.Unicode().tag(attr=True) - def init(self): - self._logger = logging.getLogger(__name__) - - # check if the key exists - try: - s3.Object(BUCKET, self._key).load() - except 
botocore.exceptions.ClientError as e: - if e.response["Error"]["Code"] == "404": - raise ValueError("Not found: '%s'" % self._key) # TODO list options - else: - raise - - @property - def _key(self): - return "%s/%s/%s/%s/%s" % (self.parameter, self.level, self.date, self.hour, self.forecast) - - @tl.default("nan_vals") - def _get_nan_vals(self): - return [self.dataset.nodata] - # return list(self.dataset.nodatavals) # which? - @property def source(self): - return self._key - - @tl.default("dataset") - def open_dataset(self): - """Opens the data source""" - - cache_key = "fileobj" - with rasterio.MemoryFile() as f: - if self.cache_ctrl and self.has_cache(key=cache_key): - data = self.get_cache(key=cache_key) - f.write(data) - else: - self._logger.info("Downloading S3 fileobj (Bucket: %s, Key: %s)" % (BUCKET, self._key)) - s3.Object(BUCKET, self._key).download_fileobj(f) - f.seek(0) - self.cache_ctrl and self.put_cache(f.read(), key=cache_key) - f.seek(0) - - dataset = f.open() - - return dataset + return "s3://%s/%s/%s/%s/%s/%s" % (BUCKET, self.parameter, self.level, self.date, self.hour, self.forecast) -class GFS(DataSource): +# TODO time interpolation +class GFS(S3Mixin, DiskCacheMixin, DataSource): parameter = tl.Unicode().tag(attr=True) level = tl.Unicode().tag(attr=True) date = tl.Unicode().tag(attr=True) hour = tl.Unicode().tag(attr=True) - @property - def source(self): - return "%s/%s/%s/%s" % (self.parameter, self.level, self.date, self.hour) + cache_coordinates = tl.Bool(True) - def init(self): - # TODO check prefix and the options at the next level - - self._prefix = "%s/%s/%s/%s/" % (self.parameter, self.level, self.date, self.hour) - self.forecasts = [obj.key.replace(self._prefix, "") for obj in bucket.objects.filter(Prefix=self._prefix)] + @property + def prefix(self): + return "%s/%s/%s/%s/%s/" % (BUCKET, self.parameter, self.level, self.date, self.hour) - if not self.forecasts: - raise ValueError("Not found: '%s/*'" % self._prefix) + @cached_property(use_cache_ctrl=True) + def forecasts(self): + return [path.replace(self.prefix, "") for path in self.s3.find(self.prefix)] + @cached_property + def sources(self): params = { "parameter": self.parameter, "level": self.level, "date": self.date, "hour": self.hour, "cache_ctrl": self.cache_ctrl, + "s3": self.s3, } - self._sources = np.array([GFSSource(forecast=h, **params) for h in self.forecasts]) # can we load this lazily? 
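For reference, the listing that `GFS.forecasts` performs above can be reproduced directly with anonymous s3fs. A sketch, reusing the parameter and level from the example at the bottom of this file; the date and hour are hypothetical placeholders:

```python
import s3fs

s3 = s3fs.S3FileSystem(anon=True)

# bucket layout: <parameter>/<level>/<date>/<hour>/<forecast>
prefix = "noaa-gfs-pds/SOIM/0-10 m DPTH/20200101/1200/"  # hypothetical date/hour
forecasts = sorted(path.replace(prefix, "") for path in s3.find(prefix))
print(forecasts[:5])
```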
+ return np.array([GFSSource(forecast=forecast, **params) for forecast in self.forecasts]) - nc = self._sources[0].native_coordinates + def get_coordinates(self): + nc = self.sources[0].coordinates base_time = datetime.datetime.strptime("%s %s" % (self.date, self.hour), "%Y%m%d %H%M") forecast_times = [base_time + datetime.timedelta(hours=int(h)) for h in self.forecasts] - tc = Coordinates([[dt.strftime("%Y-%m-%d %H:%M") for dt in forecast_times]], dims=["time"], crs=nc.crs) - self.set_trait("native_coordinates", merge_dims([nc, tc])) + tc = Coordinates( + [[dt.strftime("%Y-%m-%d %H:%M") for dt in forecast_times]], dims=["time"], crs=nc.crs, validate_crs=False + ) + return merge_dims([nc, tc]) def get_data(self, coordinates, coordinates_index): data = self.create_output_array(coordinates) - for i, source in enumerate(self._sources[coordinates_index[2]]): + for i, source in enumerate(self.sources[coordinates_index[2]]): data[:, :, i] = source.eval(coordinates.drop("time")) return data -class GFSLatest(GFS): - # TODO raise exception if date or hour is in init args - - def init(self): - now = datetime.datetime.now() - - # date - self.set_trait("date", now.strftime("%Y%m%d")) - - # hour - prefix = "%s/%s/%s/" % (self.parameter, self.level, self.date) - objs = bucket.objects.filter(Prefix=prefix) - hours = set(obj.key.split("/")[3] for obj in objs) - if hours: - self.set_trait("hour", max(hours)) - - super(GFSLatest, self).init() +def GFSLatest(parameter=None, level=None, **kwargs): + # date + date = datetime.datetime.now().strftime("%Y%m%d") + + # hour + prefix = "%s/%s/%s/%s/" % (BUCKET, parameter, level, date) + s3 = s3fs.S3FileSystem(anon=True) + hours = set([path.replace(prefix, "")[:4] for path in s3.find(prefix)]) + if not hours: + raise RuntimeError("No data found at '%s'" % prefix) + hour = max(hours) + + # node + return GFS(parameter=parameter, level=level, date=date, hour=hour, **kwargs) + + +if __name__ == "__main__": + import datetime + import podpac + + # switch to 'disk' cache to cache s3 data + cache_ctrl = ["ram"] + # cache_ctrl = ['ram', 'disk'] + + parameter = "SOIM" + level = "0-10 m DPTH" + + now = datetime.datetime.now() + yesterday = now - datetime.timedelta(1) + tomorrow = now + datetime.timedelta(1) + + # GFSSource (specify source date/time and forecast) + print("GFSSource node (parameter, level, date, hour)") + gfs_soim = GFSSource( + parameter=parameter, + level=level, + date=yesterday.strftime("%Y%m%d"), + hour="1200", + forecast="003", + cache_ctrl=cache_ctrl, + anon=True, + ) + + o = gfs_soim.eval(gfs_soim.coordinates) + print(o) + + # GFS (specify source date/time, select forecast at evaluation) + print("GFS node (parameter, level, date, hour)") + gfs_soim = GFS( + parameter=parameter, + level=level, + date=yesterday.strftime("%Y%m%d"), + hour="1200", + cache_ctrl=cache_ctrl, + anon=True, + ) + + # whole world forecast at this time tomorrow + c = Coordinates([gfs_soim.coordinates["lat"], gfs_soim.coordinates["lon"], tomorrow], dims=["lat", "lon", "time"]) + o = gfs_soim.eval(c) + print(o) + + # time series: get the forecast at lat=42, lon=275 every hour for the next 6 hours + start = now + stop = now + datetime.timedelta(hours=6) + c = Coordinates([42, 282, podpac.crange(start, stop, "1,h")], dims=["lat", "lon", "time"]) + o = gfs_soim.eval(c) + print(o) + + # latest (get latest source, select forecast at evaluation) + print("GFSLatest node (parameter, level)") + gfs_soim = GFSLatest(parameter=parameter, level=level, cache_ctrl=cache_ctrl, anon=True) + c = 
Coordinates([gfs_soim.coordinates["lat"], gfs_soim.coordinates["lon"], tomorrow], dims=["lat", "lon", "time"]) + o = gfs_soim.eval(c) + print(o) diff --git a/podpac/datalib/intake.py b/podpac/datalib/intake_catalog.py similarity index 83% rename from podpac/datalib/intake.py rename to podpac/datalib/intake_catalog.py index 0be222f4b..6d6d54155 100644 --- a/podpac/datalib/intake.py +++ b/podpac/datalib/intake_catalog.py @@ -12,6 +12,7 @@ # Internal imports import podpac from podpac import Coordinates +from podpac.utils import cached_property intake = lazy_module("intake") # lazy_module('intake.catalog.local.LocalCatalogEntry') @@ -33,7 +34,7 @@ class IntakeCatalog(podpac.data.DataSource): If source is a dataframe with multiple fields, this specifies the field to use for analysis.for Can be defined in the metadata in the intake catalog source. dims : dict, optional - Dictionary defining the native coordinates dimensions in the intake catalog source. + Dictionary defining the coordinates dimensions in the intake catalog source. Keys are the podpac dimensions (lat, lon, time, alt) in stacked or unstacked form. Values are the identifiers which locate the coordinates in the datasource. Can be defined in the metadata in the intake catalog source. @@ -42,7 +43,7 @@ class IntakeCatalog(podpac.data.DataSource): {'lat_lon': ['lat column', 'lon column']} {'time': 'time'} crs : str, optional - Coordinate reference system of the native coordinates. + Coordinate reference system of the coordinates. Can be defined in the metadata in the intake catalog source. @@ -57,7 +58,7 @@ class IntakeCatalog(podpac.data.DataSource): """ # input parameters - source = tl.Unicode().tag(readonly=True) + source = tl.Unicode().tag(attr=True) uri = tl.Unicode() # optional input parameters @@ -65,18 +66,12 @@ class IntakeCatalog(podpac.data.DataSource): dims = tl.Dict(default_value=None, allow_none=True) crs = tl.Unicode(default_value=None, allow_none=True) - # attributes - catalog = tl.Any() # This should be lazy-loaded, but haven't problems with that currently - # tl.Instance(intake.catalog.Catalog) - datasource = tl.Any() # Same as above - # datasource = tl.Instance(intake.catalog.local.LocalCatalogEntry) - - @tl.default("catalog") - def _default_catalog(self): + @cached_property + def catalog(self): return intake.open_catalog(self.uri) - @tl.default("datasource") - def _default_datasource(self): + @cached_property + def datasource(self): return getattr(self.catalog, self.source) # TODO: validators may not be necessary @@ -121,8 +116,8 @@ def _validate_dims(self, proposed): return dims - def get_native_coordinates(self): - """Get native coordinates from catalog definition or input dims + def get_coordinates(self): + """Get coordinates from catalog definition or input dims """ # look for dims in catalog @@ -130,7 +125,7 @@ def get_native_coordinates(self): if "dims" in self.datasource.metadata: self.dims = self.datasource.metadata["dims"] else: - raise ValueError("No native coordinates dims defined in catalog or input") + raise ValueError("No coordinates dims defined in catalog or input") # look for crs in catalog if self.crs is None: @@ -179,3 +174,26 @@ def get_data(self, coordinates, coordinates_index): # create UnitDataArray with subselected data (idx) uda = self.create_output_array(coordinates, data=data[coordinates_index]) return uda + + +if __name__ == "__main__": + node = IntakeCatalog( + uri="../podpac-examples/notebooks/demos/intake/precip/catalog.yml", # path to catalog + source="southern_rockies", # name of the 
source within catalog + field="precip", # this can be defined in catalog source metadata + dims={"time": "time"}, # this can be defined in catalog source metadata + ) + + print("catalog") + print(node.catalog) + + print("datasource") + print(node.datasource) + + print("coordinates") + print(node.coordinates) + + print("eval") + print(node.eval(node.coordinates)) + + print("done") diff --git a/podpac/datalib/modis.py b/podpac/datalib/modis.py new file mode 100644 index 000000000..0ba69fe4d --- /dev/null +++ b/podpac/datalib/modis.py @@ -0,0 +1,90 @@ +""" +MODIS on AWS OpenData +""" + +import logging + +import traitlets as tl +import rasterio + +from podpac.data import Rasterio +from podpac.core import cache +from podpac.utils import DiskCacheMixin +from podpac.authentication import S3Mixin + +BUCKET = "modis-pds" +PRODUCTS = ["MCD43A4.006", "MOD09GA.006", "MYD09GA.006", "MOD09GQ.006", "MYD09GQ.006"] + + +class MODISSource(DiskCacheMixin, Rasterio): + """ + Individual MODIS data tile using AWS OpenData, with caching. + + Attributes + ---------- + product : str + MODIS product ('MCD43A4.006', 'MOD09GA.006', 'MYD09GA.006', 'MOD09GQ.006', or 'MYD09GQ.006') + horizontal_grid : str + horizontal grid in the MODIS Sinusoidal Tiling System, e.g. '21' + vertical_grid : str + vertical grid in the MODIS Sinusoidal Tiling System, e.g. '07' + date : str + year and three-digit day of year, e.g. '2011460' + data : str + individual object (varies by product) + """ + + product = tl.Enum(values=PRODUCTS, help="MODIS product ID") + horizontal_grid = tl.Unicode(help="horizontal grid in the MODIS Sinusoidal Tiling System, e.g. '21'") + vertical_grid = tl.Unicode(help="vertical grid in the MODIS Sinusoidal Tiling System, e.g. '07'") + date = tl.Unicode(help="year and three-digit day of year, e.g. '2011460'") + data = tl.Unicode(help="individual object (varies by product)") + + def init(self): + self._logger = logging.getLogger(__name__) + + if not self.s3.exists(self.source): + raise ValueError("No S3 object found at '%s'" % self.source) + + @property + def _key(self): + return "%s/%s/%s/%s/%s" % (self.product, self.horizontal_grid, self.vertical_grid, self.date, self.data) + + @property + def source(self): + return "s3://%s/%s" % (BUCKET, self._key) + + +class MODIS(object): + """ Future MODIS node. 
""" + + product = tl.Enum(values=PRODUCTS, help="MODIS product ID") + + @staticmethod + def available(product=None, horizontal_grid=None, vertical_grid=None, date=None, data=None): + prefix = [BUCKET] + for value in [product, horizontal_grid, vertical_grid, date, data]: + if value is None: + break + prefix.append(value) + else: + return None + + prefix = "/".join(prefix) + objs = self.s3.ls(prefix) + return [obj.replace(prefix + "/", "") for obj in objs if "_scenes.txt" not in obj] + + +if __name__ == "__main__": + + source = MODISSource( + product=PRODUCTS[0], + horizontal_grid="01", + vertical_grid="11", + date="2020009", + data="MCD43A4.A2020009.h01v11.006.2020018035627_B01.TIF", + ) + + print(source.source) + # print(source.coordinates) + # print(source.eval(source.coordinates[:10, :10])) diff --git a/podpac/datalib/nasaCMR.py b/podpac/datalib/nasaCMR.py index e01532ae1..b38dd0885 100644 --- a/podpac/datalib/nasaCMR.py +++ b/podpac/datalib/nasaCMR.py @@ -3,39 +3,31 @@ """ from __future__ import division, unicode_literals, print_function, absolute_import -import warnings -from copy import deepcopy from collections import OrderedDict -from six import string_types - -import requests -import os import json import logging -_logger = logging.getLogger(__name__) +import requests +from six import string_types +_logger = logging.getLogger(__name__) -import numpy as np -import traitlets as tl +from podpac.core.utils import _get_from_url CMR_URL = r"https://cmr.earthdata.nasa.gov/search/" -def get_collection_id(auth_session=None, short_name=None, keyword=None, **kwargs): +def get_collection_id(session=None, short_name=None, keyword=None, **kwargs): """ Users NASA CMR to retrieve metadata about a data collection Parameters ----------- - auth_session: podpac.core.authentication.Session, optional + session: :class:`requets.Session`, optional An authenticated Earthdata login session - short_name: str, optional The short name of the dataset - keyword: str, optional Any keyword search parameters - **kwargs: str, optional Any additional query parameters @@ -59,10 +51,11 @@ def get_collection_id(auth_session=None, short_name=None, keyword=None, **kwargs query_string = "&".join([k + "=" + v for k, v in kwargs.items()]) - if auth_session is None: - auth_session = requests + # use generic requests session if `session` is not defined + if session is None: + session = requests - json_ = _get_from_url(auth_session, base_url + query_string) + json_ = _get_from_url(base_url + query_string, session) pydict = json.loads(json_) @@ -75,17 +68,15 @@ def get_collection_id(auth_session=None, short_name=None, keyword=None, **kwargs return collection_id -def search_granule_json(auth_session=None, entry_map=None, **kwargs): +def search_granule_json(session=None, entry_map=None, **kwargs): """ Search for specific files from NASA CMR for a particular collection Parameters ----------- - auth_session: podpac.core.authentication.Session, optional + session: :class:`requets.Session`, optional An authenticated Earthdata login session - entry_map: function A function applied to each individual entry. Could be used to filter out certain data in an entry - **kwargs: dict Additional query string parameters. At minimum the provider, provider_id, concept_id, collection_concept_id, short_name, version, or entry_title @@ -98,7 +89,20 @@ def search_granule_json(auth_session=None, entry_map=None, **kwargs): """ base_url = CMR_URL + "granules.json?" 
- if "collection_id" not in kwargs and "short_name" not in kwargs: + if not np.any( + [ + m not in kwargs + for m in [ + "provider", + "provider_id", + "concept_id", + "collection_concept_id", + "short_name", + "version", + "entry_title", + ] + ] + ): raise ValueError( "Need to provide either" " provider, provider_id, concept_id, collection_concept_id, short_name, version or entry_title" @@ -113,27 +117,27 @@ def search_granule_json(auth_session=None, entry_map=None, **kwargs): query_string = "&".join([k + "=" + str(v) for k, v in kwargs.items()]) - if auth_session is None: - auth_session = requests + if session is None: + session = requests url = base_url + query_string if "page_num" not in kwargs: - entries = _get_all_granule_pages(auth_session, url, entry_map) + entries = _get_all_granule_pages(session, url, entry_map) else: - granules = _get_from_url(auth_session, url) + granules = _get_from_url(url, session) pydict = json.loads(granules) entries = list(map(entry_map, pydict["feed"]["entry"])) return entries -def _get_all_granule_pages(auth_session, url, entry_map, max_paging_depth=1000000): +def _get_all_granule_pages(session, url, entry_map, max_paging_depth=1000000): """ Helper function for searching through all pages for a collection. Parameters ----------- - auth_session: podpac.core.authentication.EarthDataSession - Authenticated EDS session + session: :class:`requets.Session`, optional + An authenticated Earthdata login session url: str URL to website entry_map: function @@ -143,40 +147,13 @@ def _get_all_granule_pages(auth_session, url, entry_map, max_paging_depth=100000 page_size = int([q for q in url.split("?")[1].split("&") if "page_size" in q][0].split("=")[1]) max_pages = int(max_paging_depth / page_size) - pydict = json.loads(_get_from_url(auth_session, url)) + pydict = json.loads(_get_from_url(url, session)) entries = list(map(entry_map, pydict["feed"]["entry"])) for i in range(1, max_pages): page_url = url + "&page_num=%d" % (i + 1) - page_entries = json.loads(_get_from_url(auth_session, page_url))["feed"]["entry"] + page_entries = json.loads(_get_from_url(page_url, session))["feed"]["entry"] if not page_entries: break entries.extend(list(map(entry_map, page_entries))) return entries - - -def _get_from_url(auth_session, url): - """Helper function to get data from an url with error checking. - - Parameters - ----------- - auth_session: podpac.core.authentication.EarthDataSession - Authenticated EDS session - url: str - URL to website - """ - try: - r = auth_session.get(url) - if r.status_code != 200: - _logger.warning( - "Could not connect to {}, status code {}. \n *** Return Text *** \n {} \n *** End Return Text ***".format( - url, r.status_code, r.text - ) - ) - - except requests.ConnectionError as e: - _logger.warning("Cannot connect to {}:".format(url) + str(e)) - r = None - except RuntimeError as e: - _logger.warning("Cannot authenticate to {}. Check credentials. 
Error was as follows:".format(url) + str(e)) - return r.text diff --git a/podpac/datalib/smap.py b/podpac/datalib/smap.py index aa0940d4c..c6b29bb15 100644 --- a/podpac/datalib/smap.py +++ b/podpac/datalib/smap.py @@ -46,51 +46,36 @@ def isnat(a): # Internal dependencies import podpac -from podpac.core.coordinates import Coordinates, union, merge_dims, concat -from podpac.core.data import pydap_source -from podpac.core import authentication -from podpac.core.utils import common_doc +from podpac import NodeException +from podpac import authentication +from podpac.coordinates import Coordinates, merge_dims +from podpac.data import PyDAP +from podpac.utils import cached_property, DiskCacheMixin +from podpac.compositor import OrderedCompositor from podpac.core.data.datasource import COMMON_DATA_DOC -from podpac.core.node import cache_func -from podpac.core.node import NodeException -from podpac.core import cache +from podpac.core.utils import common_doc, _get_from_url -from . import nasaCMR +from podpac.datalib import nasaCMR COMMON_DOC = COMMON_DATA_DOC.copy() COMMON_DOC.update( { "smap_date": "str\n SMAP date string", "np_date": "np.datetime64\n Numpy date object", - "auth_class": ( - "EarthDataSession (Class object)\n Class used to make an authenticated session from a" - " username and password (both are defined in base class)" - ), - "auth_session": ( - "Instance(EarthDataSession)\n Authenticated session used to make http requests using" - "NASA Earth Data Login credentials" - ), "base_url": "str\n Url to nsidc openDAP server", - "layerkey": ( + "layer_key": ( "str\n Key used to retrieve data from OpenDAP dataset. This specifies the key used to retrieve " "the data" ), - "password": "User's EarthData password", - "username": "User's EarthData username", "product": "SMAP product name", "version": "Version number for the SMAP product", - "source_coordinates": """Returns the coordinates that uniquely describe each source - - Returns - ------- - :class:`podpac.Coordinates` - Coordinates that uniquely describe each source""", + "source_coordinates": "Coordinates that uniquely describe each source", "keys": """Available layers that are in the OpenDAP dataset Returns ------- List - The list of available keys from the OpenDAP dataset. Any of these keys can be set as self.datakey. + The list of available keys from the OpenDAP dataset. Any of these keys can be set as self.data_key. Notes ----- @@ -139,34 +124,7 @@ def np2smap_date(date): return date -def _get_from_url(url, auth_session): - """Helper function to get data from an NSIDC url with error checking. - - Parameters - ----------- - url: str - URL to website - auth_session: podpac.core.authentication.EarthDataSession - Authenticated EDS session - """ - try: - r = auth_session.get(url) - if r.status_code != 200: - _logger.warning("Could not connect to {}, status code {}".format(url, r.status_code)) - _logger.info("Trying to connect to {}".format(url.replace("opendap/", ""))) - r = auth_session.get(url.replace("opendap/", "")) - if r.status_code != 200: - _logger.error("Could not connect to {} to retrieve data, status code {}".format(url, r.status_code)) - raise RuntimeError("HTTP error: <%d>\n" % (r.status_code) + r.text[:4096]) - except requests.ConnectionError as e: - _logger.warning("Cannot connect to {}:".format(url) + str(e)) - r = None - except RuntimeError as e: - _logger.warning("Cannot authenticate to {}. Check credentials. 
Error was as follows:".format(url) + str(e)) - return r - - -def _infer_SMAP_product_version(product, base_url, auth_session): +def _infer_SMAP_product_version(product, base_url, session): """Helper function to automatically infer the version number of SMAP products in case user did not specify a version, or the version changed @@ -176,20 +134,20 @@ def _infer_SMAP_product_version(product, base_url, auth_session): Name of the SMAP product (e.g. one of SMAP_PRODUCT_DICT.keys()) base_url: str URL to base SMAP product page - auth_session: podpac.core.authentication.EarthDataSession - Authenticated EDS session + session: :class:`requests.Session` + Authenticated EDS session. Generally returned from :class:`SMAPSessionMixin`. """ - r = _get_from_url(base_url, auth_session) + r = _get_from_url(base_url, session=session) if r: m = re.search(product, r.text) return int(r.text[m.end() + 1 : m.end() + 4]) return int(SMAP_PRODUCT_MAP.sel(product=product, attr="default_version").item()) -# NOTE: {rdk} will be substituted for the entry's 'rootdatakey' +# NOTE: {rdk} will be substituted for the entry's 'root_data_key' SMAP_PRODUCT_DICT = { - #'.ver': ['latkey', 'lonkey', 'rootdatakey', 'layerkey' 'default_verison' + #'.ver': ['lat_key', 'lon_key', 'root_data_key', 'layer_key' 'default_verison' "SPL4SMAU": ["cell_lat", "cell_lon", "Analysis_Data_", "{rdk}sm_surface_analysis", 4], "SPL4SMGP": ["cell_lat", "cell_lon", "Geophysical_Data_", "{rdk}sm_surface", 4], "SPL3SMA": ["{rdk}latitude", "{rdk}longitude", "Soil_Moisture_Retrieval_Data_", "{rdk}soil_moisture", 3], @@ -211,7 +169,7 @@ def _infer_SMAP_product_version(product, base_url, auth_session): dims=["product", "attr"], coords={ "product": list(SMAP_PRODUCT_DICT.keys()), - "attr": ["latkey", "lonkey", "rootdatakey", "layerkey", "default_version"], + "attr": ["lat_key", "lon_key", "root_data_key", "layer_key", "default_version"], }, ) @@ -253,136 +211,225 @@ def SMAP_BASE_URL(): return BASE_URL +class SMAPSessionMixin(authentication.RequestsSessionMixin): + """SMAP requests authentication session. + Implements :class:`authentication.RequestsSessionMixin` with hostname specific to SMAP authentication. + Overrides the :meth:`requests.Session.rebuild_auth` method to handle authorization redirect from the Earthdata portal + """ + + hostname = "urs.earthdata.nasa.gov" + auth_required = True + product_url = SMAP_BASE_URL() + + @cached_property + def session(self): + """Requests Session object for making calls to remote `self.hostname` + See https://2.python-requests.org/en/master/api/#sessionapi + + Returns + ------- + :class:requests.Session + Requests Session class with `auth` attribute defined + """ + + s = self._create_session() + + # override `rebuild_auth` method + s.rebuild_auth = self._rebuild_auth + + return s + + def _rebuild_auth(self, prepared_request, response): + """ + Overrides from the library to keep headers when redirected to or from + the NASA auth host. 
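The `SMAPSessionMixin` above supplies an authenticated Earthdata `requests.Session` to every SMAP node. A hedged sketch of using that session directly, assuming credentials are stored under the hostname-scoped `"<field>@urs.earthdata.nasa.gov"` settings keys described in the EGI docstring; the `EarthdataClient` subclass and the credentials are hypothetical:

```python
import podpac
from podpac.datalib.smap import SMAPSessionMixin, SMAP_BASE_URL

# hypothetical credentials, stored under the hostname-scoped settings keys
podpac.settings["username@urs.earthdata.nasa.gov"] = "my_earthdata_username"
podpac.settings["password@urs.earthdata.nasa.gov"] = "my_earthdata_password"


class EarthdataClient(SMAPSessionMixin):
    """Hypothetical helper that only exposes the authenticated session."""


client = EarthdataClient()
r = client.session.get(SMAP_BASE_URL())  # auth headers survive redirects among NASA hosts
print(r.status_code)
```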
+ See https://2.python-requests.org/en/master/api/#requests.Session.rebuild_auth + + Parameters + ---------- + prepared_request : :class:`requests.Request` + See https://2.python-requests.org/en/master/api/#requests.Session.rebuild_auth + response : :class:`requests.Response` + See https://2.python-requests.org/en/master/api/#requests.Session.rebuild_auth + + Returns + ------- + None + """ + headers = prepared_request.headers + url = prepared_request.url + + if "Authorization" in headers: + original_parsed = requests.utils.urlparse(response.request.url) + redirect_parsed = requests.utils.urlparse(url) + + # delete Authorization headers if original and redirect do not match + # is not in product_url_regex + if ( + (original_parsed.hostname != redirect_parsed.hostname) + and redirect_parsed.hostname != self.hostname + and original_parsed.hostname != self.hostname + ): + + # parse product_url for hostname + product_url_hostname = requests.utils.urlparse(self.product_url).hostname + + # make all numbers in product_url_hostname wildcards + product_url_regex = ( + re.compile(re.sub(r"\d", r"\\d", product_url_hostname)) + if product_url_hostname is not None + else None + ) + + # if redirect matches product_url_regex, then allow the headers to stay + if product_url_regex is not None and product_url_regex.match(redirect_parsed.hostname): + pass + else: + del headers["Authorization"] + + return + + +class SMAPCompositor(OrderedCompositor): + """ + + Attributes + ---------- + sources : list + Source nodes, in order of preference. Later sources are only used where earlier sources do not provide data. + source_coordinates : :class:`podpac.Coordinates` + Coordinates that make each source unique. Must the same size as ``sources`` and single-dimensional. + shared_coordinates : :class:`podpac.Coordinates`. + Coordinates that are shared amongst all of the composited sources. + is_source_coordinates_complete : Bool + This flag is used to automatically construct coordinates as on optimization. Default is False. + For example, if the source coordinates could include the year-month-day of the source, but the actual source + also has hour-minute-second information, the source_coordinates is incomplete. + """ + + is_source_coordinates_complete = tl.Bool(False) + shared_coordinates = tl.Instance(Coordinates, allow_none=True, default_value=None) + + def select_sources(self, coordinates): + """Select sources based on requested coordinates, including setting coordinates, if possible. + + Parameters + ---------- + coordinates : :class:`podpac.Coordinates` + Coordinates to evaluate at compositor sources + + Returns + ------- + sources : :class:`np.ndarray` + Array of sources + + Notes + ----- + * If :attr:`source_coordinates` is defined, only sources that intersect the requested coordinates are selected. + * Sets sources :attr:`interpolation`. + * If source coordinates complete, sets sources :attr:`coordinates` as an optimization. + """ + + """ Optimization: . 
""" + + src_subset = super(SMAPCompositor, self).select_sources(coordinates) + + if self.is_source_coordinates_complete: + coords_subset = list(self.source_coordinates.intersect(coordinates, outer=True).coords.values())[0] + coords_dim = list(self.source_coordinates.dims)[0] + crs = self.source_coordinates.crs + for s, c in zip(src_subset, coords_subset): + nc = merge_dims( + [ + Coordinates(np.atleast_1d(c), dims=[coords_dim], crs=crs, validate_crs=False), + self.shared_coordinates, + ] + ) + s.set_coordinates(nc) + + return src_subset + + @common_doc(COMMON_DOC) -class SMAPSource(pydap_source.PyDAP): +class SMAPSource(SMAPSessionMixin, DiskCacheMixin, PyDAP): """Accesses SMAP data given a specific openDAP URL. This is the base class giving access to SMAP data, and knows how to extract the correct coordinates and data keys for the soil moisture data. Attributes ---------- - auth_class : {auth_class} - auth_session : {auth_session} date_file_url_re : SRE_Pattern Regular expression used to retrieve date from self.source (OpenDAP Url) date_time_file_url_re : SRE_Pattern Regular expression used to retrieve date and time from self.source (OpenDAP Url) - layerkey : str + layer_key : str Key used to retrieve data from OpenDAP dataset. This specifies the key used to retrieve the data nan_vals : list List of values that should be treated as no-data (these are replaced by np.nan) - rootdatakey : str + root_data_key : str String the prepends every or most keys for data in the OpenDAP dataset """ - auth_session = tl.Instance(authentication.EarthDataSession) - auth_class = tl.Type(authentication.EarthDataSession) - # Need to overwrite parent because of recursive definition - outputs = None - - @tl.default("auth_session") - def _auth_session_default(self): - session = self.auth_class(username=self.username, password=self.password, product_url=SMAP_BASE_URL()) - - # check url - try: - session.get(SMAP_BASE_URL()) - except Exception as e: - _logger.warning("Unknown exception: ", e) - return session + layer_key = tl.Unicode().tag(attr=True) + root_data_key = tl.Unicode().tag(attr=True) + nan_vals = [-9999.0] + cache_coordinates = tl.Bool(True) # date_url_re = re.compile('[0-9]{4}\.[0-9]{2}\.[0-9]{2}') date_time_file_url_re = re.compile("[0-9]{8}T[0-9]{6}") date_file_url_re = re.compile("[0-9]{8}") - rootdatakey = tl.Unicode() - - @tl.default("rootdatakey") + @tl.default("root_data_key") def _rootdatakey_default(self): - return SMAP_PRODUCT_MAP.sel(product=self.product, attr="rootdatakey").item() + return SMAP_PRODUCT_MAP.sel(product=self.product, attr="root_data_key").item() - layerkey = tl.Unicode() - - @tl.default("layerkey") + @tl.default("layer_key") def _layerkey_default(self): - return SMAP_PRODUCT_MAP.sel(product=self.product, attr="layerkey").item() - - nan_vals = [-9999.0] + return SMAP_PRODUCT_MAP.sel(product=self.product, attr="layer_key").item() @property def product(self): - """Returns the SMAP product from the OpenDAP Url + """SMAP product from the OpenDAP URL""" - Returns - ------- - str - {product} - """ src = self.source.split("/") return src[src.index("SMAP") + 1].split(".")[0] @property def version(self): - """Returns the SMAP product version from the OpenDAP Url - - Returns - ------- - int - {version} + """SMAP product version from the OpenDAP URL """ src = self.source.split("/") return int(src[src.index("SMAP") + 1].split(".")[1]) - @tl.default("datakey") - def _datakey_default(self): - return self.layerkey.format(rdk=self.rootdatakey) + @property + def data_key(self): + """PyDAP 
data_key, constructed from the layer_key and root_data_key""" + + return self.layer_key.format(rdk=self.root_data_key) @property - def latkey(self): - """The key used to retrieve the latitude + def lat_key(self): + """OpenDap dataset key for latitude. """ - Returns - ------- - str - OpenDap dataset key for latitude - """ - return SMAP_PRODUCT_MAP.sel(product=self.product, attr="latkey").item().format(rdk=self.rootdatakey) + return SMAP_PRODUCT_MAP.sel(product=self.product, attr="lat_key").item().format(rdk=self.root_data_key) @property - def lonkey(self): - """The key used to retrieve the latitude + def lon_key(self): + """OpenDap dataset key for longitude. """ - Returns - ------- - str - OpenDap dataset key for longitude - """ - return SMAP_PRODUCT_MAP.sel(product=self.product, attr="lonkey").item().format(rdk=self.rootdatakey) + return SMAP_PRODUCT_MAP.sel(product=self.product, attr="lon_key").item().format(rdk=self.root_data_key) - @common_doc(COMMON_DOC) - @cache_func("native.coordinates") - def get_native_coordinates(self): - """{get_native_coordinates} - """ - times = self.get_available_times() - ds = self.dataset - lons = np.array(ds[self.lonkey][:, :]) - lats = np.array(ds[self.latkey][:, :]) - lons[lons == self.nan_vals[0]] = np.nan - lats[lats == self.nan_vals[0]] = np.nan - lons = np.nanmean(lons, axis=0) - lats = np.nanmean(lats, axis=1) - coords = podpac.Coordinates([times, lats, lons], dims=["time", "lat", "lon"]) - return coords + @cached_property + def available_times(self): + """Retrieve the available times from the SMAP file. - def get_available_times(self): - """Retrieve the available times from the SMAP file. This is primarily based on the filename, but some products - have multiple times stored in a single file. + This is primarily based on the filename, but some products have multiple times stored in a single file. 
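To illustrate the filename-based parsing described here, a rough sketch using the regexes defined above; the source URL is hypothetical and the module's own date helpers may differ in detail:

    import re
    import numpy as np

    date_time_file_url_re = re.compile("[0-9]{8}T[0-9]{6}")
    source = "SPL4SMAU.004/2018.05.19/SMAP_L4_SM_aup_20180519T030000_Vv4030_001.h5"  # hypothetical source URL
    dt = date_time_file_url_re.search(source).group()  # "20180519T030000"
    iso = "{}-{}-{}T{}:{}:{}".format(dt[:4], dt[4:6], dt[6:8], dt[9:11], dt[11:13], dt[13:15])
    times = np.array([iso], dtype="datetime64[s]")  # -> 2018-05-19T03:00:00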
Returns ------- np.ndarray(dtype=np.datetime64) Available times in the SMAP source """ + m = self.date_time_file_url_re.search(self.source) if not m: m = self.date_file_url_re.search(self.source) @@ -392,6 +439,19 @@ def get_available_times(self): times = times + np.array([6, 18], "timedelta64[h]") return times + @common_doc(COMMON_DOC) + def get_coordinates(self): + """{get_coordinates} + """ + lons = np.array(self.dataset[self.lon_key][:, :]) + lats = np.array(self.dataset[self.lat_key][:, :]) + lons[lons == self.nan_vals[0]] = np.nan + lats[lats == self.nan_vals[0]] = np.nan + lons = np.nanmean(lons, axis=0) + lats = np.nanmean(lats, axis=1) + coords = Coordinates([self.available_times, lats, lons], dims=["time", "lat", "lon"]) + return coords + @common_doc(COMMON_DOC) def get_data(self, coordinates, coordinates_index): """{get_data} @@ -400,23 +460,23 @@ def get_data(self, coordinates, coordinates_index): s = tuple([slc for d, slc in zip(coordinates.dims, coordinates_index) if "time" not in d]) if "SM_P_" in self.source: d = self.create_output_array(coordinates) - am_key = self.layerkey.format(rdk=self.rootdatakey + "AM") - pm_key = self.layerkey.format(rdk=self.rootdatakey + "PM") + "_pm" + am_key = self.layer_key.format(rdk=self.root_data_key + "AM") + pm_key = self.layer_key.format(rdk=self.root_data_key + "PM") + "_pm" try: - t = self.native_coordinates.coords["time"][0] + t = self.coordinates.coords["time"][0] d.loc[dict(time=t)] = np.array(self.dataset[am_key][s]) except: pass try: - t = self.native_coordinates.coords["time"][1] + t = self.coordinates.coords["time"][1] d.loc[dict(time=t)] = np.array(self.dataset[pm_key][s]) except: pass else: - data = np.array(self.dataset[self.datakey][s]) + data = np.array(self.dataset[self.data_key][s]) d = self.create_output_array(coordinates, data=data.reshape(coordinates.shape)) return d @@ -445,15 +505,14 @@ class SMAPProperties(SMAPSource): 'SMAP_L4_SM_lmc_00000000T000000_Vv{latest_version}.h5') """ + source = tl.Unicode().tag(attr=True) file_url_re = re.compile(r"SMAP.*_[0-9]{8}T[0-9]{6}_.*\.h5") - source = tl.Unicode().tag(readonly=True) - @tl.default("source") def _property_source_default(self): - v = _infer_SMAP_product_version("SPL4SMLM", SMAP_BASE_URL(), self.auth_session) + v = _infer_SMAP_product_version("SPL4SMLM", SMAP_BASE_URL(), self.session) url = SMAP_BASE_URL() + "/SPL4SMLM.%03d/2015.03.31/" % (v) - r = _get_from_url(url, self.auth_session) + r = _get_from_url(url, session=self.session) if not r: return "None" n = self.file_url_re.search(r.text).group() @@ -498,23 +557,21 @@ def _property_source_default(self): ] ).tag(attr=True) - @tl.default("layerkey") + @tl.default("layer_key") def _layerkey_default(self): return "{rdk}" + self.property @common_doc(COMMON_DOC) - @cache_func("native.coordinates") - def get_native_coordinates(self): - """{get_native_coordinates} + def get_coordinates(self): + """{get_coordinates} """ - ds = self.dataset - lons = np.array(ds[self.lonkey][:, :]) - lats = np.array(ds[self.latkey][:, :]) + lons = np.array(self.dataset[self.lon_key][:, :]) + lats = np.array(self.dataset[self.lat_key][:, :]) lons[lons == self.nan_vals[0]] = np.nan lats[lats == self.nan_vals[0]] = np.nan lons = np.nanmean(lons, axis=0) lats = np.nanmean(lats, axis=1) - coords = podpac.Coordinates([lats, lons], dims=["lat", "lon"]) + coords = Coordinates([lats, lons], dims=["lat", "lon"]) return coords @@ -543,16 +600,12 @@ class SMAPWilt(SMAPProperties): @common_doc(COMMON_DOC) -class 
SMAPDateFolder(podpac.compositor.OrderedCompositor): +class SMAPDateFolder(SMAPSessionMixin, DiskCacheMixin, SMAPCompositor): """Compositor of all the SMAP source urls present in a particular folder which is defined for a particular date Attributes ---------- - auth_class : {auth_class} - auth_session : {auth_session} base_url : {base_url} - cache_native_coordinates : bool, optional - Default is False. If True, the native_coordinates will be cached to disk after being computed the first time date_time_url_re : SRE_Pattern Regular expression used to retrieve the date and time from the filename if file_url_re matches date_url_re : SRE_Pattern @@ -568,54 +621,19 @@ class SMAPDateFolder(podpac.compositor.OrderedCompositor): tile cover? latlon_url_re : SRE_Pattern Regular expression used to find the lat-lon coordinates associated with the file from the file name - layerkey : {layerkey} - password : {password} + layer_key : {layer_key} product : str {product} version : int {version} - username : {username} """ - auth_session = tl.Instance(authentication.EarthDataSession) - auth_class = tl.Type(authentication.EarthDataSession) - username = tl.Unicode(None, allow_none=True) - password = tl.Unicode(None, allow_none=True) - # Need to overwrite parent because of recursive definition - outputs = None - - @tl.validate("source_coordinates") - def _validate_source_coordinates(self, d): - # Need to overwrite parent because of recursive definition - return d["value"] - - @tl.default("cache_ctrl") - def _cache_ctrl_default(self): - # append disk store to default cache_ctrl if not present - default_ctrl = cache.get_default_cache_ctrl() - stores = default_ctrl._cache_stores - if not any(isinstance(store, cache.DiskCacheStore) for store in default_ctrl._cache_stores): - stores.append(cache.DiskCacheStore()) - return cache.CacheCtrl(stores) - - @tl.default("auth_session") - def _auth_session_default(self): - return self.auth_class(username=self.username, password=self.password, product_url=SMAP_BASE_URL()) - base_url = tl.Unicode().tag(attr=True) - - @tl.default("base_url") - def _base_url_default(self): - return SMAP_BASE_URL() - product = tl.Enum(SMAP_PRODUCT_MAP.coords["product"].data.tolist()).tag(attr=True) version = tl.Int(allow_none=True).tag(attr=True) - - @tl.default("version") - def _detect_product_version(self): - return _infer_SMAP_product_version(self.product, self.base_url, self.auth_session) - folder_date = tl.Unicode("").tag(attr=True) + layer_key = tl.Unicode().tag(attr=True) + latlon_delta = tl.Float(default_value=1.5).tag(attr=True) file_url_re = re.compile(r".*_[0-9]{8}T[0-9]{6}_.*\.h5") file_url_re2 = re.compile(r".*_[0-9]{8}_.*\.h5") @@ -623,30 +641,32 @@ def _detect_product_version(self): date_url_re = re.compile(r"[0-9]{8}") latlon_url_re = re.compile(r"[0-9]{3}[E,W][0-9]{2}[N,S]") - latlon_delta = tl.Float(default_value=1.5).tag(attr=True) + # list of attribute names, used by __repr__ and __str__ to display minimal info about the node + _repr_keys = ["product", "folder_date"] - cache_native_coordinates = tl.Bool(False) + @tl.default("base_url") + def _base_url_default(self): + return SMAP_BASE_URL() - layerkey = tl.Unicode() + @tl.default("version") + def _detect_product_version(self): + return _infer_SMAP_product_version(self.product, self.base_url, self.session) - @tl.default("layerkey") + @tl.default("layer_key") def _layerkey_default(self): - return SMAP_PRODUCT_MAP.sel(product=self.product, attr="layerkey").item() - - @tl.observe("layerkey") - def _layerkey_change(self, change): - 
if change["old"] != change["new"] and change["old"] != "": - for s in self.sources: - s.layerkey = change["new"] + return SMAP_PRODUCT_MAP.sel(product=self.product, attr="layer_key").item() - def __repr__(self): - rep = "{}".format("SMAP") - rep += "\n\tproduct: {}".format(self.product) + @tl.default("shared_coordinates") + def _default_shared_coordinates(self): + """Coordinates that are shared by all files in the folder.""" - return rep + if self.product in SMAP_INCOMPLETE_SOURCE_COORDINATES: + return None + coords = copy.deepcopy(self.sources[0].coordinates) + return coords.drop("time") - @property - def source(self): + @cached_property + def folder_url(self): """URL to OpenDAP dataset folder Returns @@ -656,20 +676,14 @@ def source(self): """ return "/".join([self.base_url, "%s.%03d" % (self.product, self.version), self.folder_date]) - @tl.default("sources") - def sources_default(self): - """SMAPSource objects pointing to URLs of specific SMAP files in the folder + @cached_property + def sources(self): + """SMAPSource objects pointing to URLs of specific SMAP files in the folder""" - Returns - ------- - np.ndarray(dtype=object(SMAPSource)) - Array of SMAPSource instances tied to specific SMAP files - """ # Swapped the try and except blocks. SMAP filenames may change version numbers, which causes cached source to # break. Hence, try to get the new source everytime, unless data is offline, in which case rely on the cache. try: - _, _, sources = self.get_available_coords_sources() - self.put_cache(sources, "sources", overwrite=True) + _, _, sources = self.available_coords_sources except: # No internet or authentication error try: sources = self.get_cache("sources") @@ -678,7 +692,9 @@ def sources_default(self): "Connection or Authentication error, and no disk cache to fall back on for determining sources." ) - b = self.source + "/" + else: + self.put_cache(sources, "sources", overwrite=True) + time_crds = self.source_coordinates["time"] if time_crds.is_monotonic and time_crds.is_uniform and time_crds.size > 1: tol = time_crds.coordinates[1] - time_crds.coordinates[0] @@ -687,21 +703,13 @@ def sources_default(self): tol = tol - tol tol = np.timedelta64(1, dtype=(tol.dtype)) - src_objs = [ - SMAPSource( - source=b + s, - auth_session=self.auth_session, - layerkey=self.layerkey, - interpolation={"method": "nearest", "time_tolerance": tol}, - ) - for s in sources - ] - return np.array(src_objs) + kwargs = {"layer_key": self.layer_key, "interpolation": {"method": "nearest", "time_tolerance": tol}} + return [SMAPSource(source="%s/%s" % (self.folder_url, s), **kwargs) for s in sources] - @tl.default("is_source_coordinates_complete") - def src_crds_complete_default(self): - """Flag use to optimize creation of native_coordinates. If the source_coordinates are complete, - native_coordinates can easily be reconstructed, and same with shared coordinates. + @property + def is_source_coordinates_complete(self): + """Flag use to optimize creation of coordinates. If the source_coordinates are complete, + coordinates can easily be reconstructed, and same with shared coordinates. 
Returns ------- @@ -710,11 +718,12 @@ def src_crds_complete_default(self): """ return self.product not in SMAP_INCOMPLETE_SOURCE_COORDINATES - def get_source_coordinates(self): - """{source_coordinates} - """ + @cached_property + def source_coordinates(self): + """{source_coordinates}""" + try: - times, latlon, _ = self.get_available_coords_sources() + times, latlon, _ = self.available_coords_sources except: try: return self.get_cache("source.coordinates") @@ -722,30 +731,17 @@ def get_source_coordinates(self): raise NodeException( "Connection or Authentication error, and no disk cache to fall back on for determining sources." ) - - if latlon is not None and latlon.size > 0: - crds = podpac.Coordinates([[times, latlon[:, 0], latlon[:, 1]]], dims=["time_lat_lon"]) else: - crds = podpac.Coordinates([times], dims=["time"]) - self.put_cache(crds, "source.coordinates", overwrite=True) - return crds - - @cache_func("shared.coordinates") - def get_shared_coordinates(self): - """Coordinates that are shared by all files in the folder. - - Returns - ------- - podpac.Coordinates - Coordinates shared by all files in the folder - """ - if self.product in SMAP_INCOMPLETE_SOURCE_COORDINATES: - return None - - coords = copy.deepcopy(self.sources[0].native_coordinates) - return coords.drop("time") + if latlon is not None and latlon.size > 0: + crds = Coordinates([[times, latlon[:, 0], latlon[:, 1]]], dims=["time_lat_lon"]) + else: + crds = Coordinates([times], dims=["time"]) + self.put_cache(crds, "source.coordinates", overwrite=True) + return crds - def get_available_coords_sources(self): + # TODO just return the elements, then return each actual thing separately + @cached_property + def available_coords_sources(self): """Read NSIDC site for available coordinate sources Returns @@ -762,10 +758,9 @@ def get_available_coords_sources(self): RuntimeError If the NSIDC website cannot be accessed """ - url = self.source - r = _get_from_url(url, self.auth_session) + r = _get_from_url(self.folder_url, self.session) if r is None: - _logger.warning("Could not contact {} to retrieve source coordinates".format(url)) + _logger.warning("Could not contact {} to retrieve source coordinates".format(self.folder_url)) return np.array([]), None, np.array([]) soup = bs4.BeautifulSoup(r.text, "lxml") a = soup.find_all("a") @@ -820,124 +815,114 @@ def keys(self): """ return self.sources[0].keys - @property - def base_definition(self): - """ Definition for SMAP node. Sources not required as these are computed. - """ - d = super(podpac.compositor.Compositor, self).base_definition - d["interpolation"] = self.interpolation - return d - @common_doc(COMMON_DOC) -class SMAP(podpac.compositor.OrderedCompositor): +class SMAP(SMAPSessionMixin, DiskCacheMixin, SMAPCompositor): """Compositor of all the SMAPDateFolder's for every available SMAP date. Essentially a compositor of all SMAP data for a particular product. Attributes ---------- - auth_class : {auth_class} - auth_session : {auth_session} base_url : {base_url} date_url_re : SRE_Pattern Regular expression used to extract all folder dates (or folder names) for the particular SMAP product. 
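For example (the directory listing below is made up), this pattern maps NSIDC folder names to times as follows:

    import re
    import numpy as np

    date_url_re = re.compile(r"[0-9]{4}\.[0-9]{2}\.[0-9]{2}")
    links = ["2018.05.19/", "2018.05.20/", "contents.html"]  # hypothetical server directory listing
    dates = [m.group() for m in (date_url_re.match(a) for a in links) if m]  # ['2018.05.19', '2018.05.20']
    times = [np.datetime64(d.replace(".", "-")) for d in dates]  # SMAP folder date -> numpy datetime64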
- layerkey : {layerkey} - password : {password} + layer_key : {layer_key} product : str {product} - username : {username} """ - # Need to overwrite parent because of recursive definition - outputs = None base_url = tl.Unicode().tag(attr=True) + product = tl.Enum(SMAP_PRODUCT_MAP.coords["product"].data.tolist(), default_value="SPL4SMAU").tag(attr=True) + version = tl.Int(allow_none=True).tag(attr=True) + layer_key = tl.Unicode().tag(attr=True) + + date_url_re = re.compile(r"[0-9]{4}\.[0-9]{2}\.[0-9]{2}") - @tl.validate("source_coordinates") - def _validate_source_coordinates(self, d): - # Need to overwrite parent because of recursive definition - return d["value"] + _repr_keys = ["product"] @tl.default("base_url") def _base_url_default(self): return SMAP_BASE_URL() - product = tl.Enum(SMAP_PRODUCT_MAP.coords["product"].data.tolist(), default_value="SPL4SMAU").tag(attr=True) - version = tl.Int(allow_none=True).tag(attr=True) - @tl.default("version") def _detect_product_version(self): - return _infer_SMAP_product_version(self.product, self.base_url, self.auth_session) - - date_url_re = re.compile(r"[0-9]{4}\.[0-9]{2}\.[0-9]{2}") - - auth_session = tl.Instance(authentication.EarthDataSession) - auth_class = tl.Type(authentication.EarthDataSession) - username = tl.Unicode(None, allow_none=True) - password = tl.Unicode(None, allow_none=True) + return _infer_SMAP_product_version(self.product, self.base_url, self.session) - @tl.default("auth_session") - def _auth_session_default(self): - return self.auth_class(username=self.username, password=self.password, product_url=SMAP_BASE_URL()) + @tl.default("layer_key") + def _layerkey_default(self): + return SMAP_PRODUCT_MAP.sel(product=self.product, attr="layer_key").item() - layerkey = tl.Unicode() + @tl.default("shared_coordinates") + def _default_shared_coordinates(self): + """Coordinates that are shared by all files in the SMAP product family. - @tl.default("layerkey") - def _layerkey_default(self): - return SMAP_PRODUCT_MAP.sel(product=self.product, attr="layerkey").item() + Notes + ------ + For example, the gridded SMAP data have the same lat-lon coordinates in every file (global at some resolution), + and the only difference between files is the time coordinate. 
+ This is not true for the SMAP-Sentinel product, in which case this function returns None + """ + if self.product in SMAP_INCOMPLETE_SOURCE_COORDINATES: + return None - @tl.observe("layerkey") - def _layerkey_change(self, change): - if change["old"] != change["new"] and change["old"] != "": - for s in self.sources: - s.layerkey = change["new"] + sample_source = SMAPDateFolder(product=self.product, version=self.version, folder_date=self.available_dates[0]) + return sample_source.shared_coordinates - def __repr__(self): - rep = "{}".format("SMAP") - rep += "\n\tproduct: {}".format(self.product) - rep += "\n\tinterpolation: {}".format(self.interpolation) + @cached_property + def available_dates(self): + """ Available dates in SMAP date format, sorted.""" + url = "/".join([self.base_url, "%s.%03d" % (self.product, self.version)]) + r = _get_from_url(url, self.session) + if r is None: + _logger.warning("Could not contact {} to retrieve source coordinates".format(url)) + return [] + soup = bs4.BeautifulSoup(r.text, "lxml") + matches = [self.date_url_re.match(a.get_text()) for a in soup.find_all("a")] + dates = [m.group() for m in matches if m] + return dates + + @cached_property + def sources(self): + """Array of SMAPDateFolder objects pointing to specific SMAP folders""" + + kwargs = { + "product": self.product, + "version": self.version, + "layer_key": self.layer_key, + "shared_coordinates": self.shared_coordinates, # this is an optimization + } + return [SMAPDateFolder(folder_date=date, **kwargs) for date in self.available_dates] - return rep + @common_doc(COMMON_DOC) + @cached_property + def source_coordinates(self): + """{source_coordinates} + """ + available_times = [np.datetime64(date.replace(".", "-")) for date in self.available_dates] + return Coordinates([available_times], dims=["time"]) @property - def source(self): - """The source is used for a unique name to cache SMAP products. + def base_ref(self): + """Summary Returns ------- - str - The SMAP product name. + TYPE + Description """ - return "%s.%03d" % (self.product, self.version) - - @tl.default("sources") - def sources_default(self): - """SMAPDateFolder objects pointing to specific SMAP folders + return "{0}_{1}".format(self.__class__.__name__, self.product) - Returns - ------- - np.ndarray(dtype=object(SMAPDateFolder)) - Array of SMAPDateFolder instances tied to specific SMAP folders + @property + @common_doc(COMMON_DOC) + def keys(self): + """{keys} """ - dates = self.get_available_times_dates()[1] - src_objs = np.array( - [ - SMAPDateFolder( - product=self.product, - version=self.version, - folder_date=date, - shared_coordinates=self.shared_coordinates, - auth_session=self.auth_session, - layerkey=self.layerkey, - ) - for date in dates - ] - ) - return src_objs + return self.sources[0].keys @common_doc(COMMON_DOC) def find_coordinates(self): """ - {native_coordinates} + {coordinates} Notes ----- @@ -946,78 +931,11 @@ def find_coordinates(self): if self.product in SMAP_IRREGULAR_COORDINATES: raise Exception("Native coordinates too large. 
Try using get_filename_coordinates_sources().") - shared = self.get_shared_coordinates() - partial_sources = self.get_source_coordinates()["time"].coordinates - complete_source_0 = self.sources[0].get_source_coordinates()["time"].coordinates + partial_sources = self.source_coordinates["time"].coordinates + complete_source_0 = self.sources[0].source_coordinates["time"].coordinates offset = complete_source_0 - partial_sources[0] full_times = (partial_sources[:, None] + offset[None, :]).ravel() - return [merge_dims([podpac.Coordinates([full_times], ["time"]), shared])] - - @common_doc(COMMON_DOC) - def get_source_coordinates(self): - """{source_coordinates} - """ - return podpac.Coordinates([self.get_available_times_dates()[0]], dims=["time"]) - - def get_available_times_dates(self): - """Returns the available folder dates in the SMAP product - - Returns - ------- - np.ndarray - Array of dates in numpy datetime64 format - list - list of dates in SMAP date format - - Raises - ------ - RuntimeError - If the http resource could not be accessed (check Earthdata login credentials) - """ - url = "/".join([self.base_url, "%s.%03d" % (self.product, self.version)]) - r = _get_from_url(url, self.auth_session) - if r is None: - _logger.warning("Could not contact {} to retrieve source coordinates".format(url)) - return np.array([]), [] - soup = bs4.BeautifulSoup(r.text, "lxml") - a = soup.find_all("a") - regex = self.date_url_re - times = [] - dates = [] - for aa in a: - m = regex.match(aa.get_text()) - if m: - times.append(np.datetime64(m.group().replace(".", "-"))) - dates.append(m.group()) - times.sort() - dates.sort() - return np.array(times), dates - - @cache_func("shared.coordinates") - def get_shared_coordinates(self): - """Coordinates that are shared by all files in the SMAP product family. - - Returns - ------- - podpac.Coordinates - Coordinates shared by all files in the SMAP product. - - Notes - ------ - For example, the gridded SMAP data have the same lat-lon coordinates in every file (global at some resolution), - and the only difference between files is the time coordinate. - This is not true for the SMAP-Sentinel product, in which case this function returns None - """ - if self.product in SMAP_INCOMPLETE_SOURCE_COORDINATES: - return None - - coords = SMAPDateFolder( - product=self.product, - version=self.version, - folder_date=self.get_available_times_dates()[1][0], - auth_session=self.auth_session, - ).shared_coordinates - return coords + return [podpac.coordinates.merge_dims([Coordinates([full_times], ["time"]), self.shared_coordinates])] def get_filename_coordinates_sources(self, bounds=None, update_cache=False): """Returns coordinates solely based on the filenames of the sources. This function was motivated by the @@ -1025,18 +943,18 @@ def get_filename_coordinates_sources(self, bounds=None, update_cache=False): Parameters ----------- - bounds: podpac.Coordinates, Optional + bounds: Coordinates, Optional Default is None. Return the coordinates based on filenames of the source only within the specified bounds. When not None, the result is not cached. update_cache: bool, optional Default is False. The results of this call are automatically cached to disk. This function will try to - update the cache if new data arrives. Only set this flag to True to rebuild the entire index locally (which + update the cache if new data arrives. Only set this flag to True to rebuild_auth the entire index locally (which may be needed when version numbers in the filenames change). 
Returns ------- - podpac.Coordinates + Coordinates Coordinates of all the sources in the product family Container Container that will generate an array of the SMAPSources pointing to unique OpenDAP urls corresponding to @@ -1087,10 +1005,7 @@ def latlonmap(x): # Get CMR data filenames = nasaCMR.search_granule_json( - auth_session=self.auth_session, - entry_map=lambda x: x["producer_granule_id"], - short_name=self.product, - **kwargs, + session=self.session, entry_map=lambda x: x["producer_granule_id"], short_name=self.product, **kwargs ) if not filenames: return Coordinates([]), [], [] @@ -1112,7 +1027,7 @@ def latlonmap(x): return crds, filenames, dates # Create kwargs for making a SMAP source - create_kwargs = {"auth_session": self.auth_session, "layer_key": self.layerkey} + create_kwargs = {"layer_key": self.layer_key} if self.interpolation: create_kwargs["interpolation"] = self.interpolation @@ -1161,64 +1076,23 @@ def latlonmap(x): self.put_cache(crds, "filename.coordinates", overwrite=update_cache) self.put_cache(sources, "filename.sources", overwrite=update_cache) - # Update the auth_session and/or interpolation and/or other keyword arguments in the sources class + # Updates interpolation and/or other keyword arguments in the sources class sources.create_kwargs = create_kwargs return crds, sources - @property - def base_ref(self): - """Summary - - Returns - ------- - TYPE - Description - """ - return "{0}_{1}".format(self.__class__.__name__, self.product) - - @property - def base_definition(self): - """ Definition for SMAP node. Sources not required as these are computed. - """ - d = super(podpac.compositor.Compositor, self).base_definition - d["interpolation"] = self.interpolation - return d - - @property - @common_doc(COMMON_DOC) - def keys(self): - """{keys} - """ - return self.sources[0].keys - -class SMAPBestAvailable(podpac.compositor.OrderedCompositor): +class SMAPBestAvailable(OrderedCompositor): """Compositor of SMAP-Sentinel and the Level 4 SMAP Analysis Update soil moisture """ - @tl.default("sources") - def sources_default(self): - """Orders the compositor of SPL2SMAP_S in front of SPL4SMAU - - Returns - ------- - np.ndarray(dtype=object(SMAP)) - Array of SMAP product sources - """ - src_objs = np.array( - [ - SMAP(interpolation=self.interpolation, product="SPL2SMAP_S"), - SMAP(interpolation=self.interpolation, product="SPL4SMAU"), - ] - ) - return src_objs - - def __repr__(self): - rep = "{}".format("SMAP (Best Available)") - return rep + @cached_property + def sources(self): + """Orders the compositor of SPL2SMAP_S in front of SPL4SMAU. 
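A hypothetical evaluation of this compositor; the coordinates are illustrative only:

    import podpac
    from podpac.datalib.smap import SMAPBestAvailable

    node = SMAPBestAvailable()
    coords = podpac.Coordinates(
        [podpac.clinspace(45, 40, 50), podpac.clinspace(-100, -90, 50), "2018-05-19T12:00:00"],
        dims=["lat", "lon", "time"],
    )
    output = node.eval(coords)  # SPL2SMAP_S where it has data, SPL4SMAU elsewhere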
""" - def get_shared_coordinates(self): - return None # NO shared coordiantes + return [ + SMAP(interpolation=self.interpolation, product="SPL2SMAP_S"), + SMAP(interpolation=self.interpolation, product="SPL4SMAU"), + ] class GetSMAPSources(object): @@ -1241,13 +1115,9 @@ def __getitem__(self, slc): source_urls = [base_url + np2smap_date(d)[:10] + "/" + f for d, f in zip(self.dates[slc], self.filenames[slc])] return np.array([SMAPSource(source=s, **self.create_kwargs) for s in source_urls], object)[return_slice] - @property + @cached_property def base_url(self): - if not self._base_url: - self._base_url = SMAPDateFolder( - product=self.product, folder_date="00001122", auth_session=self.create_kwargs["auth_session"] - ).source[:-8] - return self._base_url + return SMAPDateFolder(product=self.product, folder_date="00001122").source[:-8] def __len__(self): return len(self.filenames) @@ -1259,3 +1129,67 @@ def intersect(self, I): dates=[self.dates[i] for i in I], create_kwargs=self.create_kwargs, ) + + +if __name__ == "__main__": + import getpass + from matplotlib import pyplot + import podpac + + logging.basicConfig() + + product = "SPL4SMAU" + interpolation = {"method": "nearest", "params": {"time_tolerance": np.timedelta64(2, "h")}} + + sm = SMAP(product=product, interpolation=interpolation) + + # username = input("Username: ") + # password = getpass.getpass("Password: ") + # sm.set_credentials(username=username, password=password) + + # SMAP info + print(sm) + print("SMAP Definition:", sm.json_pretty) + print( + "SMAP available_dates:", + "%s - %s (%d)" % (sm.available_dates[0], sm.available_dates[1], len(sm.available_dates)), + ) + print("SMAP source_coordinates:", sm.source_coordinates) + print("SMAP shared_coordinates:", sm.shared_coordinates) + print("Sources:", sm.sources[:3], "... (%d)" % len(sm.sources)) + + # sample SMAPDateFolder info + sm_datefolder = sm.sources[0] + print("Sample DateFolder:", sm_datefolder) + print("Sample DateFolder Definition:", sm_datefolder.json_pretty) + print("Sample DateFolder source_coordinates:", sm_datefolder.source_coordinates) + print("Sample DateFolder Sources:", sm_datefolder.sources[:3], "... 
(%d)" % len(sm_datefolder.sources)) + + # sample SMAPSource info + sm_source = sm_datefolder.sources[0] + print("Sample DAP Source:", sm_source) + print("Sample DAP Source Definition:", sm_source.json_pretty) + print("Sample DAP Native Coordinates:", sm_source.coordinates) + + print("Another Sample DAP Native Coordinates:", sm_datefolder.sources[1].coordinates) + + # eval whole world + c_world = Coordinates( + [podpac.crange(90, -90, -2.0), podpac.crange(-180, 180, 2.0), "2018-05-19T12:00:00"], + dims=["lat", "lon", "time"], + ) + o = sm.eval(c_world) + o.plot(cmap="gist_earth_r") + pyplot.axis("scaled") + + # eval points over time + lat = [45.0, 45.0, 0.0, 45.0] + lon = [-100.0, 20.0, 20.0, 100.0] + c_pts = Coordinates([[lat, lon], podpac.crange("2018-05-15T00", "2018-05-19T00", "3,h")], dims=["lat_lon", "time"]) + + o = sm.eval(c_pts) + # sm.threaded = False + pyplot.plot(ot.time, ot.data.T) + + pyplot.show() + print("Done") diff --git a/podpac/datalib/smap_egi.py b/podpac/datalib/smap_egi.py index 4cdb0259e..967462ef6 100644 --- a/podpac/datalib/smap_egi.py +++ b/podpac/datalib/smap_egi.py @@ -33,11 +33,8 @@ def isnat(a): np.isnat = isnat # Internal dependencies -import podpac -import podpac.datalib -from podpac.core.coordinates import Coordinates +from podpac import Coordinates, UnitsDataArray, cached_property from podpac.datalib import EGI -from podpac.core.units import UnitsDataArray SMAP_PRODUCT_DICT = { #'shortname': ['lat_key', 'lon_key', 'data_key', 'quality_flag', 'default_verison'] @@ -49,44 +46,45 @@ def isnat(a): "/Soil_Moisture_Retrieval_Data/longitude", "/Soil_Moisture_Retrieval_Data/soil_moisture", "/Soil_Moisture_Retrieval_Data/retrieval_qual_flag", - 3, + "003", ], "SPL3SMA": [ "/Soil_Moisture_Retrieval_Data/latitude", "/Soil_Moisture_Retrieval_Data/longitude", "/Soil_Moisture_Retrieval_Data/soil_moisture", "/Soil_Moisture_Retrieval_Data/retrieval_qual_flag", - 3, + "003", ], "SPL3SMP_AM": [ "/Soil_Moisture_Retrieval_Data_AM/latitude", "/Soil_Moisture_Retrieval_Data_AM/longitude", "/Soil_Moisture_Retrieval_Data_AM/soil_moisture", "/Soil_Moisture_Retrieval_Data_AM/retrieval_qual_flag", - 5, + "005", ], "SPL3SMP_PM": [ "/Soil_Moisture_Retrieval_Data_PM/latitude", "/Soil_Moisture_Retrieval_Data_PM/longitude", "/Soil_Moisture_Retrieval_Data_PM/soil_moisture_pm", "/Soil_Moisture_Retrieval_Data_PM/retrieval_qual_flag_pm", - 5, + "005", ], "SPL3SMP_E_AM": [ "/Soil_Moisture_Retrieval_Data_AM/latitude", "/Soil_Moisture_Retrieval_Data_AM/longitude", "/Soil_Moisture_Retrieval_Data_AM/soil_moisture", "/Soil_Moisture_Retrieval_Data_AM/retrieval_qual_flag", - 3, + "003", ], "SPL3SMP_E_PM": [ "/Soil_Moisture_Retrieval_Data_PM/latitude_pm", "/Soil_Moisture_Retrieval_Data_PM/longitude_pm", "/Soil_Moisture_Retrieval_Data_PM/soil_moisture_pm", "/Soil_Moisture_Retrieval_Data_PM/retrieval_qual_flag_pm", - 3, + "003", ], } + SMAP_PRODUCTS = list(SMAP_PRODUCT_DICT.keys()) @@ -112,29 +110,37 @@ class SMAP(EGI): check_quality_flags = tl.Bool(True).tag(attr=True) quality_flag_key = tl.Unicode(allow_none=True).tag(attr=True) - # set default short_name, data_key, lat_key, lon_key, version - @tl.default("short_name") - def _short_name_default(self): + @property + def short_name(self): if "SPL3SMP" in self.product: return self.product.replace("_AM", "").replace("_PM", "") else: return self.product - @tl.default("lat_key") - def _lat_key_default(self): - return SMAP_PRODUCT_DICT[self.product][0] + # pull data_key, lat_key, lon_key, and version from product dict + @cached_property + def 
_product_data(self): + return SMAP_PRODUCT_DICT[self.product] + + @property + def lat_key(self): + return self._product_data[0] + + @property + def lon_key(self): + return self._product_data[1] - @tl.default("lon_key") - def _lon_key_default(self): - return SMAP_PRODUCT_DICT[self.product][1] + @property + def data_key(self): + return self._product_data[2] - @tl.default("quality_flag_key") - def _quality_flag_key_default(self): - return SMAP_PRODUCT_DICT[self.product][3] + @property + def quality_flag_key(self): + return self._product_data[3] - @tl.default("data_key") - def _data_key_default(self): - return SMAP_PRODUCT_DICT[self.product][2] + @property + def version(self): + return self._product_data[4] @property def coverage(self): @@ -143,10 +149,6 @@ def coverage(self): else: return (self.data_key, self.lat_key, self.lon_key) - @tl.default("version") - def _version_default(self): - return SMAP_PRODUCT_DICT[self.product][4] - def read_file(self, filelike): """Interpret individual SMAP file from EGI zip archive. @@ -260,3 +262,25 @@ def append_file(self, all_data, data): data.lat.data = lat.data return all_data.combine_first(data) + + +if __name__ == "__main__": + import logging + import getpass + from podpac import Coordinates, clinspace + + logger = logging.getLogger() + logger.setLevel(logging.DEBUG) + + username = input("Username:") + password = getpass.getpass("Password:") + + # level 3 access + c = Coordinates( + [clinspace(-82, -81, 10), clinspace(38, 39, 10), clinspace("2015-07-06", "2015-07-08", 10)], + dims=["lon", "lat", "time"], + ) + + node = SMAP(product="SPL3SMP_AM", username=username, password=password) + output = node.eval(c) + print(output) diff --git a/podpac/datalib/terraintiles.py b/podpac/datalib/terraintiles.py index c759554a9..b875f3ed3 100644 --- a/podpac/datalib/terraintiles.py +++ b/podpac/datalib/terraintiles.py @@ -44,13 +44,7 @@ from podpac.data import Rasterio from podpac.compositor import OrderedCompositor from podpac.interpolators import Rasterio as RasterioInterpolator, ScipyGrid, ScipyPoint -from podpac.data import interpolation_trait - -from lazy_import import lazy_module - -# optional imports -s3fs = lazy_module("s3fs") -rasterio = lazy_module("rasterio") +from podpac.data import InterpolationTrait #### # private module attributes @@ -58,7 +52,6 @@ # create log for module _logger = logging.getLogger(__name__) -_s3 = s3fs.S3FileSystem(anon=True) class TerrainTilesSource(Rasterio): @@ -75,15 +68,13 @@ class TerrainTilesSource(Rasterio): rasterio dataset """ - outputs = None - # parameters source = tl.Unicode().tag(readonly=True) # attributes - interpolation = interpolation_trait( + interpolation = InterpolationTrait( default_value={"method": "nearest", "interpolators": [RasterioInterpolator, ScipyGrid, ScipyPoint]} - ).tag(readonly=True) + ).tag(attr=True) @tl.default("crs") def _default_crs(self): @@ -94,45 +85,6 @@ def _default_crs(self): if "normal" in self.source: return "EPSG:3857" - @tl.default("dataset") - def open_dataset(self): - """Opens the data source""" - - cache_key = "fileobj" - with rasterio.MemoryFile() as f: - - # load data from cache - if self.cache_ctrl and self.has_cache(key=cache_key): - _logger.debug("Retrieving terrain tile {} from cache'".format(self.source)) - data = self.get_cache(key=cache_key) - f.write(data) - - else: - - # try finding local file first - try: - with open(self.source, "rb") as localfile: - data = localfile.read() - - # download and put in cache - except FileNotFoundError: - _logger.info("Downloading S3 
fileobj: {}".format(self.source)) - with _s3.open(self.source, "rb") as s3file: - data = s3file.read() - - # write to memory file - f.write(data) - - # put data in the cache - _logger.debug("Caching terrain tile {} in key 'fileobj'".format(self.source)) - self.cache_ctrl # confirm this is initialized - self.put_cache(data, key=cache_key) - - f.seek(0) - dataset = f.open() - - return dataset - def get_data(self, coordinates, coordinates_index): data = super(TerrainTilesSource, self).get_data(coordinates, coordinates_index) data.data[data.data < 0] = np.nan @@ -161,7 +113,7 @@ def download(self, path="terraintiles"): # download the file _logger.debug("Downloading terrain tile {} to filepath: {}".format(self.source, filepath)) - _s3.get(self.source, filepath) + self.s3.get(self.source, filepath) class TerrainTiles(OrderedCompositor): @@ -195,8 +147,6 @@ class TerrainTiles(OrderedCompositor): Defaults to 'elevation-tiles-prod' """ - outputs = None - # parameters zoom = tl.Int(default_value=6).tag(attr=True) tile_format = tl.Enum(["geotiff", "terrarium", "normal"], default_value="geotiff").tag(attr=True) @@ -206,17 +156,6 @@ class TerrainTiles(OrderedCompositor): def _default_sources(self): return np.array([]) - @property - def source(self): - """ - S3 Bucket source of TerrainTiles - - Returns - ------- - str - """ - return self.bucket - def select_sources(self, coordinates): # get all the tile sources for the requested zoom level and coordinates sources = get_tile_urls(self.tile_format, self.zoom, coordinates) @@ -244,7 +183,7 @@ def download(self, path="terraintiles"): raise ValueError("No terrain tile sources selected. Evaluate node at coordinates to select sources.") def _create_source(self, source): - return TerrainTilesSource(source="{}/{}".format(self.bucket, source)) + return TerrainTilesSource(source="s3://{}/{}".format(self.bucket, source), cache_ctrl=self.cache_ctrl) ############ @@ -480,3 +419,26 @@ def _mercator_to_tilespace(xm, ym, zoom): y = int(tiles * (np.pi - ym) / diameter) return x, y + + +if __name__ == "__main__": + from podpac import Coordinates, clinspace + + c = Coordinates([clinspace(40, 43, 1000), clinspace(-76, -72, 1000)], dims=["lat", "lon"]) + + print("TerrainTiles") + node = TerrainTiles(tile_format="geotiff", zoom=8) + output = node.eval(c) + print(output) + + print("TerrainTiles cached") + node = TerrainTiles(tile_format="geotiff", zoom=8, cache_ctrl=["ram", "disk"]) + output = node.eval(c) + print(output) + + # tile urls + print("get tile urls") + print(np.array(get_tile_urls("geotiff", 1))) + print(np.array(get_tile_urls("geotiff", 9, coordinates=c))) + + print("done") diff --git a/podpac/datalib/test/test_smap.py b/podpac/datalib/test/test_smap.py new file mode 100644 index 000000000..64c3bbba3 --- /dev/null +++ b/podpac/datalib/test/test_smap.py @@ -0,0 +1,80 @@ +import pytest + +import requests +import traitlets as tl + +import podpac.datalib +from podpac.datalib import smap + +from podpac import settings + +# dummy class mixing in custom Earthdata requests Session +class SomeSmapNode(smap.SMAPSessionMixin): + pass + + +class TestSMAPSessionMixin(object): + url = "urs.earthdata.nasa.gov" + + def test_hostname(self): + node = SomeSmapNode() + assert node.hostname == self.url + + def test_auth_required(self): + # make sure auth is deleted from setttings, if it was already there + + # auth required + with settings: + if "username@urs.earthdata.nasa.gov" in settings: + del settings["username@urs.earthdata.nasa.gov"] + + if "password@urs.earthdata.nasa.gov" in 
settings: + del settings["password@urs.earthdata.nasa.gov"] + + node = SomeSmapNode() + + # throw auth error + with pytest.raises(ValueError, match="username"): + node.session + + node.set_credentials(username="testuser", password="testpass") + + assert node.session + assert node.session.auth == ("testuser", "testpass") + assert isinstance(node.session, requests.Session) + + def test_set_credentials(self): + with settings: + node = SomeSmapNode() + node.set_credentials(username="testuser", password="testpass") + + assert settings["username@{}".format(self.url)] == "testuser" + assert settings["password@{}".format(self.url)] == "testpass" + + def test_session(self): + with settings: + + node = SomeSmapNode() + node.set_credentials(username="testuser", password="testpass") + + assert node.session + assert node.session.auth == ("testuser", "testpass") + assert isinstance(node.session, requests.Session) + + def test_earth_data_session_rebuild_auth(self): + class Dum(object): + pass + + with settings: + node = SomeSmapNode() + node.set_credentials(username="testuser", password="testpass") + + prepared_request = Dum() + prepared_request.headers = {"Authorization": 0} + prepared_request.url = "https://example.com" + + response = Dum() + response.request = Dum() + response.request.url = "https://example2.com" + + node.session.rebuild_auth(prepared_request, response) diff --git a/podpac/datalib/weathercitizen.py b/podpac/datalib/weathercitizen.py new file mode 100644 index 000000000..52d231be6 --- /dev/null +++ b/podpac/datalib/weathercitizen.py @@ -0,0 +1,685 @@ +""" +Weather Citizen + +Crowd sourced environmental observations from mobile devices (https://weathercitizen.org) + +- Documentation: https://weathercitizen.org/docs +- API: https://api.weathercitizen.org + +Requires + +- requests: `pip install requests` +- pandas: `pip install pandas` + +Optionally: + +- read_protobuf: `pip install read-protobuf` - decodes sensor burst media files +""" + +import json +from datetime import datetime +from datetime import timedelta +import logging +from copy import deepcopy + +import traitlets as tl +import pandas as pd +import numpy as np +import requests + +from podpac.data import DataSource +from podpac.core.data.datasource import COMMON_DATA_DOC +from podpac.core.utils import common_doc, trait_is_defined +from podpac.core.coordinates import Coordinates, UniformCoordinates1d, ArrayCoordinates1d, StackedCoordinates + + +URL = "https://api.weathercitizen.org/" +DATE_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ" # always UTC (ISO 8601 / RFC 3339 format) + +# create log for module +_logger = logging.getLogger(__name__) + + +class WeatherCitizen(DataSource): + """DataSource to handle WeatherCitizen data + + Attributes + ---------- + source : str + Collection (database) to pull data from. + Defaults to "geosensors" which is the primary data collection + data_key : str, int + Data key of interest, default "properties.pressure" + uuid : str, list(str), options + String or list of strings to filter data by uuid + device : str, list(str), ObjectId, list(ObjectId), optional + String or list of strings to filter data by device object id + version : string, list(str), optional + String or list of strings to filter data to filter data by WeatherCitizen version + query : dict, optional + Arbitrary pymongo query to apply to data. 
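For context, a hypothetical evaluation of this node; whether data comes back depends on what has been uploaded for the queried region and time:

    import podpac
    from podpac.datalib.weathercitizen import WeatherCitizen

    node = WeatherCitizen(data_key="properties.pressure")  # "geosensors" collection by default
    coords = podpac.Coordinates(
        [podpac.clinspace(36, 34, 10), podpac.clinspace(-83, -81, 10), podpac.clinspace("2019-09-01", "2019-09-02", 10)],
        dims=["lat", "lon", "time"],
    )
    output = node.eval(coords)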
+        Note that certain fields in this query may be overridden if other keyword arguments are specified
+    verbose : bool, optional
+        Display log messages or progress
+    """
+
+    source = tl.Unicode(allow_none=True, default_value="geosensors")
+    data_key = tl.Unicode(allow_none=True, default_value="properties.pressure").tag(attr=True)
+    uuid = tl.Unicode(allow_none=True, default_value=None).tag(attr=True)
+    device = tl.Unicode(allow_none=True, default_value=None).tag(attr=True)
+    version = tl.Unicode(allow_none=True, default_value=None).tag(attr=True)
+    query = tl.Unicode(allow_none=True, default_value=None).tag(attr=True)
+    verbose = tl.Bool(allow_none=True, default_value=True).tag(attr=True)
+    override_limit = tl.Bool(allow_none=True, default_value=False).tag(attr=True)
+
+    @common_doc(COMMON_DATA_DOC)
+    def get_coordinates(self):
+        """{get_coordinates}
+        """
+
+        # TODO: how to limit data retrieval for large queries?
+
+        # query parameters
+        start_time = datetime(2016, 1, 1, 1, 0, 0)  # before WeatherCitizen existed
+        projection = {"properties.time": 1, "geometry.coordinates": 1}
+
+        # make sure data_key exists in dataset
+        query = {self.data_key: {"$exists": True}}
+
+        # handle if the user specifies a query and the data_key is already in that query
+        if self.query is not None and self.data_key in self.query:
+            query = deepcopy(self.query)
+            query[self.data_key]["$exists"] = True
+
+        # check the length of the matched items
+        length = get(
+            collection=self.source,
+            start_time=start_time,
+            uuid=self.uuid,
+            device=self.device,
+            version=self.version,
+            query=query,
+            projection=projection,
+            verbose=self.verbose,
+            return_length=True,
+        )
+
+        # add some kind of stop on querying above a certain length?
+        if length > 10000 and not self.override_limit:
+            raise ValueError(
+                "More than {} data points match this WeatherCitizen query. Please reduce the scope of your query.".format(
+                    length
+                )
+            )
+
+        items = get(
+            collection=self.source,
+            start_time=start_time,
+            uuid=self.uuid,
+            device=self.device,
+            version=self.version,
+            query=query,
+            projection=projection,
+            verbose=self.verbose,
+        )
+
+        lat = [item["geometry"]["coordinates"][1] for item in items]
+        lon = [item["geometry"]["coordinates"][0] for item in items]
+        time = [item["properties"]["time"] for item in items]
+
+        return Coordinates([[lat, lon, time]], dims=["lat_lon_time"])
+
+    @common_doc(COMMON_DATA_DOC)
+    def get_data(self, coordinates, coordinates_index):
+        """{get_data}
+        """
+
+        # TODO: how to limit data retrieval for large queries?
+
+        # default coordinate bounds for queries
+        time_bounds = [datetime(2016, 1, 1, 1, 0, 0), None]  # before WeatherCitizen existed
+        lat_bounds = [-90, 90]
+        lon_bounds = [-180, 180]
+
+        # override bounds
+        if "time" in coordinates.udims:
+            time_bounds = coordinates["time"].bounds
+        if "lat" in coordinates.udims:
+            lat_bounds = coordinates["lat"].bounds
+        if "lon" in coordinates.udims:
+            lon_bounds = coordinates["lon"].bounds
+
+        box = [[lon_bounds[0], lat_bounds[0]], [lon_bounds[1], lat_bounds[1]]]
+
+        # make sure data_key exists in dataset
+        query = {self.data_key: {"$exists": True}}
+
+        # handle if the user specifies a query and the data_key is already in that query
+        if self.query is not None and self.data_key in self.query:
+            query = deepcopy(self.query)
+            query[self.data_key]["$exists"] = True
+
+        # only project data key
+        projection = {self.data_key: 1}
+
+        # check the length of the matched items
+        length = get(
+            collection=self.source,
+            start_time=time_bounds[0],
+            end_time=time_bounds[1],
+            box=box,
+            uuid=self.uuid,
+            device=self.device,
+            version=self.version,
+            query=query,
+            projection=projection,
+            verbose=self.verbose,
+            return_length=True,
+        )
+
+        # add some kind of stop on querying above a certain length?
+        if length > 10000 and not self.override_limit:
+            raise ValueError(
+                "More than {} data points match this WeatherCitizen query. Please reduce the scope of your query.".format(
+                    length
+                )
+            )
+
+        items = get(
+            collection=self.source,
+            start_time=time_bounds[0],
+            end_time=time_bounds[1],
+            box=box,
+            uuid=self.uuid,
+            device=self.device,
+            version=self.version,
+            query=query,
+            projection=projection,
+            verbose=self.verbose,
+        )
+
+        data = np.array([item[self.data_key] for item in items])
+
+        return self.create_output_array(coordinates, data=data)
+
+
+##############
+# Standalone functions
+##############
+def get(
+    collection="geosensors",
+    start_time=None,
+    end_time=None,
+    box=None,
+    near=None,
+    uuid=None,
+    device=None,
+    version=None,
+    query=None,
+    projection=None,
+    verbose=False,
+    dry_run=False,
+    return_length=False,
+):
+    """Get documents from the server for devices in a timerange
+
+    Parameters
+    ----------
+    collection : str, list(str)
+        Collection(s) to query
+    start_time : str, datetime, optional
+        String or datetime for start of timerange (>=).
+        Defaults to 1 hour ago.
+        This input must be compatible with pandas `pd.to_datetime(start_time, utc=True)`
+        Input assumes UTC by default, but will recognize timezone string EDT, UTC, etc. For example "2019-09-01 08:00 EDT"
+    end_time : str, datetime, optional
+        Same as `start_time` but specifies end of time range (<).
+        Defaults to now.
+    box : list(list(float)), optional
+        Geo bounding box described as 2-d array of bottom-left and top-right corners.
+        If specified, `near` will be ignored.
+        Contents: [[lon, lat], [lon, lat]] for the bottom-left and upper-right corners
+        For example: [[-83, 36], [-81, 34]]
+    near : tuple([float, float], int), optional
+        Geo bounding box described as a center point and a radius (km) from the center point.
+        This input will be ignored if box is defined.
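A hypothetical call to this function, reusing example values from the parameter descriptions; the end time and projection fields are assumptions:

    items = get(
        collection="geosensors",
        start_time="2019-09-01 08:00 EDT",
        end_time="2019-09-01 12:00 EDT",
        box=[[-83, 36], [-81, 34]],  # [lon, lat] bottom-left and upper-right corners
        projection={"properties.pressure": 1, "properties.time": 1, "geometry.coordinates": 1},
    )
    df = to_dataframe(items)  # helper defined later in this module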
+ Contents: ([, ], ) + For example: ([-72.544655, 40.932559], 16000) + uuid : str, list(str), options + String or list of strings to filter data by uuid + device : str, list(str), ObjectId, list(ObjectId), optional + String or list of strings to filter data by device object id + version : string, list(str), optional + String or list of strings to filter data to filter data by WeatherCitizen version + query : dict, optional + Arbitrary pymongo query to apply to data. + Note that certain fields in this query may be overriden if other keyword arguments are specified + projection: dict, optional + Specify what fields should or should not be returned. + Dict keys are field names. + Dict values should be set to 1 to include field (and exclude all others) or set to 0 to exclude field and include all others + verbose : bool, optional + Display log messages or progress + dry_run : bool, optional + Return urls of queries instead of the actual query. + Returns a list of str with urls for each collections. + Defaults to False. + return_length : bool, optional + Return length of the documents that match the query + + Returns + ------- + list + List of items from server matching query. + If `dry_run` is True, returns a list or url strings for query. + """ + + # always make collection a list + if isinstance(collection, str): + collection = [collection] + + # get query string for each collection in list + query_strs = [ + _build_query( + collection=coll, + start_time=start_time, + end_time=end_time, + box=box, + near=near, + uuid=uuid, + device=device, + version=version, + query=query, + projection=projection, + ) + for coll in collection + ] + + # dry run + if dry_run: + return query_strs + + if verbose: + print("Querying WeatherCitizen API") + + # only return the length of the matched documents + if return_length: + length = 0 + for query_str in query_strs: + length += _get(query_str, verbose=verbose, return_length=return_length) + + if verbose: + print("Returned {} records".format(length)) + + return length + + # start query at page 0 with no items + # iterate through collections aggregating items + items = [] + for query_str in query_strs: + items += _get(query_str, verbose=verbose) + + if verbose: + print("\r") + print("Downloaded {} records".format(len(items))) + + return items + + +def get_record(collection, obj_id, url=URL): + """Get a single record from a collection by obj_id + + Parameters + ---------- + collection : str + Collection name + obj_id : str + Object id + """ + + # check url + if url[-1] != "/": + url = "{}/".format(url) + + # query the server + r = requests.get(url + collection + "/" + obj_id) + + if r.status_code != 200: + raise ValueError("Failed to query the server with status {}.\n\nResponse:\n {}".format(r.status_code, r.text)) + + return r.json() + + +def get_file(media, save=False, output_path=None): + """Get media file + + Parameters + ---------- + media : str, dict + Media record or media record object id in the media or geomedia collections. 
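An illustrative call; the object id and output filename are placeholders:

    content = get_file("5d2a9b8f6e1c4a0012345678")  # returns the raw file bytes
    get_file("5d2a9b8f6e1c4a0012345678", save=True, output_path="burst.pb")  # or write it to disk instead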
+    save : bool, optional
+        Save to file
+    output_path : str, optional
+        If save is True, output the file to a different file path
+
+    Returns
+    -------
+    bytes
+        If save is False, returns the raw file content as bytes
+
+    Raises
+    ------
+    ValueError
+        If the media id cannot be found, or if the file fails to download
+    """
+
+    if isinstance(media, str):
+        media_id = media
+    elif isinstance(media, dict):
+        media_id = media["_id"]
+
+    try:
+        record = get_record("media", media_id)
+    except ValueError:
+        try:
+            record = get_record("geomedia", media_id)
+
+        except ValueError:
+            raise ValueError("Media id {} not found in the database".format(media_id))
+
+    # get file
+    r = requests.get(record["file"]["url"])
+
+    if r.status_code != 200:
+        raise ValueError(
+            "Failed to download binary data with status code {}.\n\nResponse:\n {}".format(r.status_code, r.text)
+        )
+
+    # save to file if requested, otherwise return the raw content
+    if save:
+        if output_path is None:
+            output_path = record["properties"]["filename"]
+        with open(output_path, "wb") as f:
+            f.write(r.content)
+    else:
+        return r.content
+
+
+def read_sensorburst(media):
+    """Download and read sensorburst records.
+
+    Requires:
+    - read-protobuf: `pip install read-protobuf`
+    - sensorburst_pb2: Download from https://api.weathercitizen.org/static/sensorburst_pb2.py
+    - Once downloaded, put this file in the same directory as your analysis
+
+    Parameters
+    ----------
+    media : str, dict, list of str, list of dict
+        Media record(s) or media record object id(s) in the media or geomedia collections.
+
+    Returns
+    -------
+    pd.DataFrame
+        Returns pandas dataframe of records
+    """
+
+    try:
+        from read_protobuf import read_protobuf
+    except ImportError:
+        raise ImportError(
+            "Reading sensorburst requires `read_protobuf` module. Install using `pip install read-protobuf`."
+        )
+
+    # import sensorburst definition
+    try:
+        from podpac.datalib import weathercitizen_sensorburst_pb2 as sensorburst_pb2
+    except ImportError:
+        try:
+            import sensorburst_pb2
+        except ImportError:
+            raise ImportError(
+                "Processing WeatherCitizen protobuf requires `sensorburst_pb2.py` in the current working directory. Download from https://api.weathercitizen.org/static/sensorburst_pb2.py."
+            )
+
+    if isinstance(media, (str, dict)):
+        media = [media]
+
+    # get pb content
+    pbs = [get_file(m) for m in media]
+
+    # initialize protobuf object
+    Burst = sensorburst_pb2.Burst()
+
+    # get the first dataframe
+    df = read_protobuf(pbs[0], Burst)
+
+    # append later dataframes
+    if len(pbs) > 1:
+        for pb in pbs[1:]:
+            df = df.append(read_protobuf(pb, Burst), sort=False)
+
+    return df
+
+
+def to_dataframe(items):
+    """Create normalized dataframe from records
+
+    Parameters
+    ----------
+    items : list of dict
+        Record items returned from `get()`
+    """
+    df = pd.json_normalize(items)
+
+    # Convert geometry.coordinates to lat and lon
+    df["lat"] = df["geometry.coordinates"].apply(lambda coord: coord[1] if coord and coord is not np.nan else None)
+    df["lon"] = df["geometry.coordinates"].apply(lambda coord: coord[0] if coord and coord is not np.nan else None)
+    df = df.drop(["geometry.coordinates"], axis=1)
+
+    # break up all the arrays so the data is easier to use
+    arrays = [
+        "properties.accelerometer",
+        "properties.gravity",
+        "properties.gyroscope",
+        "properties.linear_acceleration",
+        "properties.magnetic_field",
+        "properties.orientation",
+        "properties.rotation_vector",
+    ]
+
+    for col in arrays:
+        df[col + "_0"] = df[col].apply(lambda val: val[0] if val and val is not np.nan else None)
+        df[col + "_1"] = df[col].apply(lambda val: val[1] if val and val is not np.nan else None)
+        df[col + "_2"] = df[col].apply(lambda val: val[2] if val and val is not np.nan else None)
+
+        df = df.drop([col], axis=1)
+
+    return df
+
+
+def to_csv(items, filename="weathercitizen-data.csv"):
+    """Convert items to CSV output
+
+    Parameters
+    ----------
+    items : list of dict
+        Record items returned from `get()`
+    filename : str, optional
+        Output CSV filename. Defaults to "weathercitizen-data.csv".
+    """
+
+    df = to_dataframe(items)
+
+    df.to_csv(filename)
+
+
+def update_progress(current, total):
+    """Display a simple console progress bar.
+
+    Parameters
+    ----------
+    current : int, float
+        current number
+    total : int, float
+        total number
+    """
+
+    if total == 0:
+        return
+
+    progress = float(current / total)
+    bar_length = 20
+    block = int(round(bar_length * progress))
+    text = "Progress: |{0}| [{1} / {2}]".format("#" * block + " " * (bar_length - block), current, total)
+
+    print("\r", text, end="")
+
+
+def _build_query(
+    collection="geosensors",
+    start_time=None,
+    end_time=None,
+    box=None,
+    near=None,
+    uuid=None,
+    device=None,
+    version=None,
+    query=None,
+    projection=None,
+):
+    """Build a query string for a single collection.
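+    The returned string is appended to the API url by :func:`_get`.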
+ See :func:`get` for type definitions of each input + + Returns + ------- + string + query string + """ + + if query is None: + query = {} + + # filter by time + # default to 1 hour ago + one_hour_ago = (datetime.utcnow() - timedelta(hours=1)).strftime(DATE_FORMAT) + if start_time is not None: + start_time = pd.to_datetime(start_time, utc=True, infer_datetime_format=True).strftime(DATE_FORMAT) + query["properties.time"] = {"$gte": start_time} + else: + query["properties.time"] = {"$gte": one_hour_ago} + + # default to now + if end_time is not None: + end_time = pd.to_datetime(end_time, utc=True, infer_datetime_format=True).strftime(DATE_FORMAT) + query["properties.time"]["$lte"] = end_time + + # geo bounding box + if box is not None: + if len(box) != 2: + raise ValueError("box parameter must be a list of length 2") + + query["geometry"] = {"$geoWithin": {"$box": box}} + + # geo bounding circle + if near is not None: + if len(near) != 2 or not isinstance(near, tuple): + raise ValueError("near parameter must be a tuple of length 2") + + query["geometry"] = {"$near": {"$geometry": {"type": "Point", "coordinates": near[0]}, "$maxDistance": near[1]}} + + # specify uuid + if uuid is not None: + if isinstance(uuid, str): + query["properties.uuid"] = uuid + elif isinstance(uuid, list): + query["properties.uuid"] = {"$in": uuid} + + # specify device + if device is not None: + if isinstance(device, str): + query["properties.device"] = device + elif isinstance(device, list): + query["properties.device"] = {"$in": device} + + # specify version + if version is not None: + if isinstance(version, str): + query["version"] = version + elif isinstance(version, list): + query["version"] = {"$in": version} + + # add collection to query string and handle projection + if projection is not None: + query_str = "{}?where={}&projection={}".format(collection, json.dumps(query), json.dumps(projection)) + else: + query_str = "{}?where={}".format(collection, json.dumps(query)) + + return query_str + + +def _get(query, items=None, url=URL, verbose=False, return_length=False): + """Internal method to query API. + See `get` for interface. + + Parameters + ---------- + query : dict, str + query dict or string + if dict, it will be converted into a string with json.dumps() + items : list, optional + aggregated items as this method is recursively called. Defaults to []. + url : str, optional + API url. Defaults to module URL. 
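+        Used as the prefix for the request (the query string is appended to this url).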
+ verbose : bool, optional + Display log messages or progress + return_length : bool, optional + Return length of the documents that match the query + + Returns + ------- + list + + Raises + ------ + ValueError + Description + """ + + # if items are none, set to [] + if items is None: + items = [] + + # check url + if url[-1] != "/": + url = "{}/".format(url) + + # query the server + r = requests.get(url + query) + + if r.status_code != 200: + raise ValueError("Failed to query the server with status {}.\n\nResponse:\n {}".format(r.status_code, r.text)) + + # get json out of response + resp = r.json() + + # return length only if requested + if return_length: + return resp["_meta"]["total"] + + # return documents + if len(resp["_items"]): + + # show progress + if verbose: + current_page = resp["_meta"]["page"] + total_pages = round(resp["_meta"]["total"] / resp["_meta"]["max_results"]) + update_progress(current_page, total_pages) + + # append items + items += resp["_items"] + + # get next set, if in links + if "_links" in resp and "next" in resp["_links"]: + return _get(resp["_links"]["next"]["href"], items=items) + else: + return items + else: + return items diff --git a/podpac/datalib/weathercitizen_sensorburst_pb2.py b/podpac/datalib/weathercitizen_sensorburst_pb2.py new file mode 100644 index 000000000..94d4f09fa --- /dev/null +++ b/podpac/datalib/weathercitizen_sensorburst_pb2.py @@ -0,0 +1,585 @@ +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! +# source: sensorburst.proto + +import sys + +_b = sys.version_info[0] < 3 and (lambda x: x) or (lambda x: x.encode("latin1")) +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database + +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +DESCRIPTOR = _descriptor.FileDescriptor( + name="sensorburst.proto", + package="sensorburst", + syntax="proto3", + serialized_options=_b("H\003"), + serialized_pb=_b( + '\n\x11sensorburst.proto\x12\x0bsensorburst"\xbf\x04\n\x06Record\x12\x0c\n\x04time\x18\x01 \x01(\x03\x12\x0c\n\x04long\x18\x02 \x01(\x02\x12\x0b\n\x03lat\x18\x03 \x01(\x02\x12\x10\n\x08\x61ltitude\x18\x04 \x01(\x02\x12\x13\n\x0btemperature\x18\x05 \x01(\x02\x12\x10\n\x08pressure\x18\x06 \x01(\x02\x12\r\n\x05light\x18\x07 \x01(\x02\x12\x11\n\tproximity\x18\x08 \x01(\x05\x12\x17\n\x0f\x61\x63\x63\x65lerometer_x\x18\t \x01(\x02\x12\x17\n\x0f\x61\x63\x63\x65lerometer_y\x18\n \x01(\x02\x12\x17\n\x0f\x61\x63\x63\x65lerometer_z\x18\x0b \x01(\x02\x12\x1d\n\x15linear_acceleration_x\x18\x0c \x01(\x02\x12\x1d\n\x15linear_acceleration_y\x18\r \x01(\x02\x12\x1d\n\x15linear_acceleration_z\x18\x0e \x01(\x02\x12\x15\n\rorientation_x\x18\x0f \x01(\x02\x12\x15\n\rorientation_y\x18\x10 \x01(\x02\x12\x15\n\rorientation_z\x18\x11 \x01(\x02\x12\x18\n\x10magnetic_field_x\x18\x12 \x01(\x02\x12\x18\n\x10magnetic_field_y\x18\x13 \x01(\x02\x12\x18\n\x10magnetic_field_z\x18\x14 \x01(\x02\x12\x13\n\x0bgyroscope_x\x18\x15 \x01(\x02\x12\x13\n\x0bgyroscope_y\x18\x16 \x01(\x02\x12\x13\n\x0bgyroscope_z\x18\x17 \x01(\x02\x12\x11\n\tgravity_x\x18\x18 \x01(\x02\x12\x11\n\tgravity_y\x18\x19 \x01(\x02\x12\x11\n\tgravity_z\x18\x1a \x01(\x02"-\n\x05\x42urst\x12$\n\x07records\x18\x01 \x03(\x0b\x32\x13.sensorburst.RecordB\x02H\x03\x62\x06proto3' + ), +) + + +_RECORD = _descriptor.Descriptor( + name="Record", + full_name="sensorburst.Record", + filename=None, + 
file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name="time", + full_name="sensorburst.Record.time", + index=0, + number=1, + type=3, + cpp_type=2, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="long", + full_name="sensorburst.Record.long", + index=1, + number=2, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="lat", + full_name="sensorburst.Record.lat", + index=2, + number=3, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="altitude", + full_name="sensorburst.Record.altitude", + index=3, + number=4, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="temperature", + full_name="sensorburst.Record.temperature", + index=4, + number=5, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="pressure", + full_name="sensorburst.Record.pressure", + index=5, + number=6, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="light", + full_name="sensorburst.Record.light", + index=6, + number=7, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="proximity", + full_name="sensorburst.Record.proximity", + index=7, + number=8, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="accelerometer_x", + full_name="sensorburst.Record.accelerometer_x", + index=8, + number=9, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="accelerometer_y", + full_name="sensorburst.Record.accelerometer_y", + index=9, + number=10, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + 
enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="accelerometer_z", + full_name="sensorburst.Record.accelerometer_z", + index=10, + number=11, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="linear_acceleration_x", + full_name="sensorburst.Record.linear_acceleration_x", + index=11, + number=12, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="linear_acceleration_y", + full_name="sensorburst.Record.linear_acceleration_y", + index=12, + number=13, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="linear_acceleration_z", + full_name="sensorburst.Record.linear_acceleration_z", + index=13, + number=14, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="orientation_x", + full_name="sensorburst.Record.orientation_x", + index=14, + number=15, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="orientation_y", + full_name="sensorburst.Record.orientation_y", + index=15, + number=16, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="orientation_z", + full_name="sensorburst.Record.orientation_z", + index=16, + number=17, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="magnetic_field_x", + full_name="sensorburst.Record.magnetic_field_x", + index=17, + number=18, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="magnetic_field_y", + full_name="sensorburst.Record.magnetic_field_y", + index=18, + number=19, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + 
serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="magnetic_field_z", + full_name="sensorburst.Record.magnetic_field_z", + index=19, + number=20, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="gyroscope_x", + full_name="sensorburst.Record.gyroscope_x", + index=20, + number=21, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="gyroscope_y", + full_name="sensorburst.Record.gyroscope_y", + index=21, + number=22, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="gyroscope_z", + full_name="sensorburst.Record.gyroscope_z", + index=22, + number=23, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="gravity_x", + full_name="sensorburst.Record.gravity_x", + index=23, + number=24, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="gravity_y", + full_name="sensorburst.Record.gravity_y", + index=24, + number=25, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="gravity_z", + full_name="sensorburst.Record.gravity_z", + index=25, + number=26, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + ], + extensions=[], + nested_types=[], + enum_types=[], + serialized_options=None, + is_extendable=False, + syntax="proto3", + extension_ranges=[], + oneofs=[], + serialized_start=35, + serialized_end=610, +) + + +_BURST = _descriptor.Descriptor( + name="Burst", + full_name="sensorburst.Burst", + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name="records", + full_name="sensorburst.Burst.records", + index=0, + number=1, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ) + ], + extensions=[], + nested_types=[], + enum_types=[], + serialized_options=None, + is_extendable=False, + syntax="proto3", + extension_ranges=[], + oneofs=[], + 
serialized_start=612, + serialized_end=657, +) + +_BURST.fields_by_name["records"].message_type = _RECORD +DESCRIPTOR.message_types_by_name["Record"] = _RECORD +DESCRIPTOR.message_types_by_name["Burst"] = _BURST +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +Record = _reflection.GeneratedProtocolMessageType( + "Record", + (_message.Message,), + { + "DESCRIPTOR": _RECORD, + "__module__": "sensorburst_pb2" + # @@protoc_insertion_point(class_scope:sensorburst.Record) + }, +) +_sym_db.RegisterMessage(Record) + +Burst = _reflection.GeneratedProtocolMessageType( + "Burst", + (_message.Message,), + { + "DESCRIPTOR": _BURST, + "__module__": "sensorburst_pb2" + # @@protoc_insertion_point(class_scope:sensorburst.Burst) + }, +) +_sym_db.RegisterMessage(Burst) + + +DESCRIPTOR._options = None +# @@protoc_insertion_point(module_scope) diff --git a/podpac/interpolators.py b/podpac/interpolators.py index 24d4ac20d..27029e4af 100644 --- a/podpac/interpolators.py +++ b/podpac/interpolators.py @@ -5,5 +5,5 @@ # REMINDER: update api docs (doc/source/user/api.rst) to reflect changes to this file -from podpac.core.data.interpolator import Interpolator -from podpac.core.data.interpolators import NearestNeighbor, NearestPreview, Rasterio, ScipyGrid, ScipyPoint +from podpac.core.interpolation.interpolator import Interpolator +from podpac.core.interpolation.interpolators import NearestNeighbor, NearestPreview, Rasterio, ScipyGrid, ScipyPoint diff --git a/podpac/managers.py b/podpac/managers.py index de07bc807..68da6c2f0 100644 --- a/podpac/managers.py +++ b/podpac/managers.py @@ -6,3 +6,5 @@ from podpac.core.managers import aws from podpac.core.managers.aws import Lambda +from podpac.core.managers.parallel import Parallel, ParallelOutputZarr +from podpac.core.managers.multi_process import Process diff --git a/podpac/pipeline.py b/podpac/pipeline.py deleted file mode 100644 index 3488ec1ed..000000000 --- a/podpac/pipeline.py +++ /dev/null @@ -1,8 +0,0 @@ -""" -Pipeline Public Module -""" - -# REMINDER: update api docs (doc/source/user/api.rst) to reflect changes to this file - -from podpac.core.pipeline import Pipeline, PipelineError -from podpac.core.pipeline.output import Output, NoOutput, FileOutput, FTPOutput, S3Output, ImageOutput diff --git a/podpac/style.py b/podpac/style.py new file mode 100644 index 000000000..f45dc25bd --- /dev/null +++ b/podpac/style.py @@ -0,0 +1,8 @@ +""" +Style Public Module +""" + +# REMINDER: update api docs (doc/source/api.rst) to reflect changes to this file + + +from podpac.core.style import Style diff --git a/podpac/utils.py b/podpac/utils.py index 84a47ee86..2367cdd54 100644 --- a/podpac/utils.py +++ b/podpac/utils.py @@ -2,8 +2,9 @@ Utils Public Module """ -# REMINDER: update api docs (doc/source/user/api.rst) to reflect changes to this file +# REMINDER: update api docs (doc/source/api.rst) to reflect changes to this file -from podpac.core.utils import create_logfile +from podpac.core.utils import create_logfile, cached_property, NodeTrait from podpac.core.cache import clear_cache +from podpac.core.node import NoCacheMixin, DiskCacheMixin diff --git a/pyproject.toml b/pyproject.toml index 6815edb68..55ec8d784 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,2 @@ [tool.black] line-length = 120 -target-version = ['py38', 'py36', 'py37'] \ No newline at end of file diff --git a/setup.py b/setup.py index 53c8bc753..d93d80a08 100644 --- a/setup.py +++ b/setup.py @@ -23,13 +23,19 @@ "traitlets>=4.3", "xarray>=0.10", "requests>=2.18", - "pyproj>=2.4", "lazy-import>=0.2.2", 
"psutil", ] if sys.version_info.major == 2: - install_requires += ["future>=0.16"] + install_requires += [ + "future>=0.16", + "pyproj>=2.2" + ] +else: + install_requires += [ + "pyproj>=2.4" + ] extras_require = { "datatype": [ @@ -68,9 +74,6 @@ "pytest-html>=1.7.0", "pytest-remotedata>=0.3.1", "recommonmark>=0.6", - "sphinx>=2.3", - "sphinx-rtd-theme>=0.4", - "sphinx-autobuild>=0.7", "coveralls>=1.3", "six>=1.0", "attrs>=17.4.0", @@ -79,9 +82,16 @@ } if sys.version_info.major == 2: - extras_require["dev"] += ["pytest>=3.3.2"] + extras_require["dev"] += [ + "pytest>=3.3.2" + ] else: - extras_require["dev"] += ["pytest>=5.0"] + extras_require["dev"] += [ + "sphinx>=2.3", + "sphinx-rtd-theme>=0.4", + "sphinx-autobuild>=0.7", + "pytest>=5.0" + ] if sys.version >= '3.6': extras_require["dev"] += [