In [4]:
from pangeo_forge_recipes.recipes import XarrayZarrRecipe
from pangeo_forge_recipes.patterns import pattern_from_file_sequence
import pandas as pd
import aiohttp #Is auth still done this way?
import os

In [37]:
#move auth to correct place
os.environ['GPM_IMERG_USERNAME'] = 'alex@developmentseed.org'
os.environ['GPM_IMERG_PASSWORD'] = 'alex@developmentseed.org'

In [31]:
# Figure out the average file size for estimating files per chunk
import xarray as xr

ds = xr.open_dataset("../../data/3B-HHR.MS.MRG.3IMERG.20210303-S040000-E042959.0240.V06B.HDF5", group="Grid")
ds


In [32]:
print(f"File size is {ds.nbytes/1e6} MB")

File size is 285.184824 MB


In [26]:
#url example
# https://arthurhouhttps.pps.eosdis.nasa.gov/gpmdata/2021/04/30/imerg/3B-HHR.MS.MRG.3IMERG.20210430-S000000-E002959.0000.V06B.HDF5
# Login is via the PPS system, username/password is the email address you register with
input_url_pattern = (
    "https://arthurhouhttps.pps.eosdis.nasa.gov/gpmdata/{yyyy}/{mm}/{dd}/imerg/3B-HHR.MS.MRG.3IMERG.{yyyymmdd}-S{sh}{sm}00-E{eh}{em}59.{MMMM}.V06B.HDF5"
)

In [29]:
#June 2000 to 
dates = pd.date_range("2000-06-01T00:00:00", "2021-05-31T23:59:59", freq="30min")
input_urls = [
    input_url_pattern.format(
        yyyy=hhr.strftime("%Y"),
        mm = hhr.strftime("%m"),
        dd = hhr.strftime("%d"),
        yyyymmdd=hhr.strftime("%Y%m%d"),
        sh = hhr.strftime("%H"),
        sm = hhr.strftime("%M"),
        eh = hhr.strftime("%H"),
        em = (hhr+pd.Timedelta("29 min")).strftime("%M"),
        MMMM = f'{(hhr.hour*60 + hhr.minute):04}'
    )
    for hhr in dates
]
print(f"Found {len(input_urls)} files!")
input_urls[-1001]


Found 368160 files!


'https://arthurhouhttps.pps.eosdis.nasa.gov/gpmdata/2021/05/11/imerg/3B-HHR.MS.MRG.3IMERG.20210511-S033000-E035959.0210.V06B.HDF5'

In [33]:
pattern = pattern_from_file_sequence(input_urls, "time", nitems_per_file=1)
pattern

<FilePattern {'time': 368160}>

In [34]:
for key in pattern:
    break
key

(DimIndex(name='time', index=0, sequence_len=368160, operation=<CombineOp.CONCAT: 2>))

In [35]:
pattern[key]

'https://arthurhouhttps.pps.eosdis.nasa.gov/gpmdata/2000/06/01/imerg/3B-HHR.MS.MRG.3IMERG.20000601-S000000-E002959.0000.V06B.HDF5'

In [51]:
recipe = XarrayZarrRecipe(
    pattern, 
    xarray_open_kwargs={'group': 'Grid', 'drop_variables': ['time_bnds', 'lon_bnds', 'lat_bnds']},
    #fsspec_open_kwargs={'auth': aiohttp.BasicAuth(os.environ['GPM_IMERG_USERNAME'], os.environ['GPM_IMERG_PASSWORD'])},
    fsspec_open_kwargs={'username': os.environ['GPM_IMERG_USERNAME'], 'password': os.environ['GPM_IMERG_PASSWORD']},
    inputs_per_chunk=1
)
recipe

XarrayZarrRecipe(file_pattern=<FilePattern {'time': 368160}>, inputs_per_chunk=1, target_chunks={}, target=None, input_cache=None, metadata_cache=None, cache_inputs=True, copy_input_to_local_file=False, consolidate_zarr=True, xarray_open_kwargs={'group': 'Grid', 'drop_variables': ['time_bnds', 'lon_bnds', 'lat_bnds']}, xarray_concat_kwargs={}, delete_input_encoding=True, fsspec_open_kwargs={'username': 'alex@developmentseed.org', 'password': 'alex@developmentseed.org'}, process_input=None, process_chunk=None, lock_timeout=None, subset_inputs={}, is_opendap=False)

In [46]:
# Testing
import logging
logger = logging.getLogger("pangeo_forge_recipes")
formatter = logging.Formatter('%(name)s:%(levelname)s - %(message)s')
handler = logging.StreamHandler()
handler.setLevel(logging.INFO)
handler.setFormatter(formatter)
logger.setLevel(logging.INFO)
logger.addHandler(handler)

In [40]:
all_inputs = list(recipe.iter_inputs())
len(all_inputs)

368160

In [41]:
all_chunks = list(recipe.iter_chunks())
len(all_chunks)

368160

In [54]:
%xmode Plain
with recipe.open_chunk(all_chunks[0]) as ds:
    display(ds)

pangeo_forge_recipes.recipes.xarray_zarr:INFO - Opening inputs for chunk time-0
pangeo_forge_recipes.recipes.xarray_zarr:INFO - Opening inputs for chunk time-0
pangeo_forge_recipes.recipes.xarray_zarr:INFO - Opening input with Xarray time-0: 'https://arthurhouhttps.pps.eosdis.nasa.gov/gpmdata/2000/06/01/imerg/3B-HHR.MS.MRG.3IMERG.20000601-S000000-E002959.0000.V06B.HDF5'
pangeo_forge_recipes.recipes.xarray_zarr:INFO - Opening input with Xarray time-0: 'https://arthurhouhttps.pps.eosdis.nasa.gov/gpmdata/2000/06/01/imerg/3B-HHR.MS.MRG.3IMERG.20000601-S000000-E002959.0000.V06B.HDF5'
pangeo_forge_recipes.storage:INFO - Opening 'https://arthurhouhttps.pps.eosdis.nasa.gov/gpmdata/2000/06/01/imerg/3B-HHR.MS.MRG.3IMERG.20000601-S000000-E002959.0000.V06B.HDF5' directly.
pangeo_forge_recipes.storage:INFO - Opening 'https://arthurhouhttps.pps.eosdis.nasa.gov/gpmdata/2000/06/01/imerg/3B-HHR.MS.MRG.3IMERG.20000601-S000000-E002959.0000.V06B.HDF5' directly.


Exception reporting mode: Plain


ClientResponseError: 401, message='Unauthorized', url=URL('https://arthurhouhttps.pps.eosdis.nasa.gov/gpmdata/2000/06/01/imerg/3B-HHR.MS.MRG.3IMERG.20000601-S000000-E002959.0000.V06B.HDF5')

In [43]:
all_chunks[1]

(DimIndex(name='time', index=1, sequence_len=368160, operation=<CombineOp.CONCAT: 2>))

In [44]:
import tempfile
from fsspec.implementations.local import LocalFileSystem
from pangeo_forge_recipes.storage import FSSpecTarget, CacheFSSpecTarget

fs_local = LocalFileSystem()

cache_dir = tempfile.TemporaryDirectory()
cache_target = CacheFSSpecTarget(fs_local, cache_dir.name)

target_dir = tempfile.TemporaryDirectory()
target = FSSpecTarget(fs_local, target_dir.name)

recipe.input_cache = cache_target
recipe.target = target
recipe

XarrayZarrRecipe(file_pattern=<FilePattern {'time': 368160}>, inputs_per_chunk=1, target_chunks={}, target=FSSpecTarget(fs=<fsspec.implementations.local.LocalFileSystem object at 0x7f84df934550>, root_path='/tmp/tmp2phhsse6'), input_cache=CacheFSSpecTarget(fs=<fsspec.implementations.local.LocalFileSystem object at 0x7f84df934550>, root_path='/tmp/tmpqq8y5pzq'), metadata_cache=None, cache_inputs=True, copy_input_to_local_file=False, consolidate_zarr=True, xarray_open_kwargs={'group': 'Grid', 'drop_variables': ['time_bnds', 'lon_bnds', 'lat_bnds']}, xarray_concat_kwargs={}, delete_input_encoding=True, fsspec_open_kwargs={'client_kwargs': {'auth': BasicAuth(login='alex@developmentseed.org', password='alex@developmentseed.org', encoding='latin1')}}, process_input=None, process_chunk=None, lock_timeout=None, subset_inputs={}, is_opendap=False)

In [None]:
for input_file in recipe.inputs_for_chunk(all_chunks[0]):
    recipe.cache_input(input_file)

In [49]:
XarrayZarrRecipe?

[0;31mInit signature:[0m
[0mXarrayZarrRecipe[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mfile_pattern[0m[0;34m:[0m [0mpangeo_forge_recipes[0m[0;34m.[0m[0mpatterns[0m[0;34m.[0m[0mFilePattern[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0minputs_per_chunk[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtarget_chunks[0m[0;34m:[0m [0mDict[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mint[0m[0;34m][0m [0;34m=[0m [0;34m<[0m[0mfactory[0m[0;34m>[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtarget[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mpangeo_forge_recipes[0m[0;34m.[0m[0mstorage[0m[0;34m.[0m[0mAbstractTarget[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0minput_cache[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mpangeo_forge_recipes[0m[0;34m.[0m[0mstorage[0m[0;34m.[0m[0mCacheFSSpecTarget[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m 

In [55]:
from fsspec import open

In [60]:
test = open("https://arthurhouhttps.pps.eosdis.nasa.gov/gpmdata/2021/03/03/gis/3B-DAY-GIS.MS.MRG.3IMERG.20210303-S000000-E235959.1830.V06B.tif",
           username=os.environ['GPM_IMERG_USERNAME'],
           password=os.environ['GPM_IMERG_PASSWORD']
           )

In [61]:
with test as f:
    ds = xr.open_dataset(f)
    ds

FileNotFoundError: https://arthurhouhttps.pps.eosdis.nasa.gov/gpmdata/2021/03/03/gis/3B-DAY-GIS.MS.MRG.3IMERG.20210303-S000000-E235959.1830.V06B.tif