In [1]:
%pylab widget

Populating the interactive namespace from numpy and matplotlib


In [2]:
import podpac
import zarr 
import numpy as np
import os

In [3]:
PATH_TO_DATA = r'D:\SMAP_Data'
PATH_TO_PROPERTY_DATA = r'D:\SMAP_Property_Data'
PATH_TO_STORE = r'C:\SMAP.zarr'
PATH_TO_PROPERTY_STORE = r'C:\SMAP_PROPS.zarr'

In [4]:
pwd = os.getcwd()
script_path = os.path.join(pwd, 'nsidc-download_SPL3SMP_E.003_2020-01-13.py')
script_path_props = os.path.join(pwd, 'nsidc-download_SPL4SMLM.004_2020-04-01.py')

In [5]:
# Download all the SMAP data
# Run the following command for the commandline, it will prompt you for a username and password. 
# These are your Earthdata login credentials
# This took multiple days for me, and almost 1 TB of space
print("cd {}\npython {}".format(PATH_TO_DATA, script_path))
print("cd {}\npython {}".format(PATH_TO_PROPERTY_DATA, script_path_props))

cd D:\SMAP_Data
python C:\Repositories\pipeline\podpac-drought-monitor\notebooks\nsidc-download_SPL3SMP_E.003_2020-01-13.py
cd D:\SMAP_Property_Data
python C:\Repositories\pipeline\podpac-drought-monitor\notebooks\nsidc-download_SPL4SMLM.004_2020-04-01.py


# Stage the Property Data as Zarr

In [6]:
# Turn the Property data into a zarr file
file_prop = [os.path.join(PATH_TO_PROPERTY_DATA, f) for f in os.listdir(PATH_TO_PROPERTY_DATA) if f.endswith('.h5')][0]
h5props = podpac.data.H5PY(source=file_prop)
h5props.available_data_keys

['/EASE2_global_projection',
 '/Land-Model-Constants_Data/cell_elevation',
 '/Land-Model-Constants_Data/cell_land_fraction',
 '/Land-Model-Constants_Data/clsm_cdcr1',
 '/Land-Model-Constants_Data/clsm_cdcr2',
 '/Land-Model-Constants_Data/clsm_dzgt1',
 '/Land-Model-Constants_Data/clsm_dzgt2',
 '/Land-Model-Constants_Data/clsm_dzgt3',
 '/Land-Model-Constants_Data/clsm_dzgt4',
 '/Land-Model-Constants_Data/clsm_dzgt5',
 '/Land-Model-Constants_Data/clsm_dzgt6',
 '/Land-Model-Constants_Data/clsm_dzpr',
 '/Land-Model-Constants_Data/clsm_dzrz',
 '/Land-Model-Constants_Data/clsm_dzsf',
 '/Land-Model-Constants_Data/clsm_dztsurf',
 '/Land-Model-Constants_Data/clsm_poros',
 '/Land-Model-Constants_Data/clsm_veghght',
 '/Land-Model-Constants_Data/clsm_wp',
 '/Land-Model-Constants_Data/mwrtm_bh',
 '/Land-Model-Constants_Data/mwrtm_bv',
 '/Land-Model-Constants_Data/mwrtm_clay',
 '/Land-Model-Constants_Data/mwrtm_lewt',
 '/Land-Model-Constants_Data/mwrtm_omega',
 '/Land-Model-Constants_Data/mwrtm_poros

In [7]:
# Set the sizes for chunks etc
group_shape = h5props.dataset['/Land-Model-Constants_Data/clsm_poros'].shape
group_chunk = (512, 512)

In [8]:
# Make a list of the desired groups 
data_groups = ['/Land-Model-Constants_Data/clsm_poros', '/Land-Model-Constants_Data/clsm_veghght']
coord_groups = ['cell_lat', 'cell_lon']


In [9]:
# Initialize the Zarr file
zf = zarr.open(PATH_TO_PROPERTY_STORE, mode='a')
zf.attrs['crs'] = 'EPSG:4326'
zf.attrs['nan_vals'] = [-9999]

In [10]:
# Initialize the coordinates
# Latittude
tmp = h5props.dataset[coord_groups[0]][:]
tmp[tmp == -9999] = np.nan
tmp = np.nanmean(tmp, axis=1)
# Check for no nans
assert np.any(np.isnan(tmp)) == False
zf['lat'] = tmp

# Longitude
tmp = h5props.dataset[coord_groups[1]][:]
tmp[tmp == -9999] = np.nan
tmp = np.nanmean(tmp, axis=0)
# Check for no nans
assert np.any(np.isnan(tmp)) == False
zf['lon'] = tmp

In [11]:
# Store the data
for dg in data_groups:
    zft = zarr.open(os.path.join(PATH_TO_PROPERTY_STORE, dg[1:]), 
              mode='w', shape=group_shape, chunks=group_chunk, dtype='f4', fill_value=-9999)
    zft[:] = h5props.dataset[dg]

# Now Stage the SMAP data as a Zarr file

In [12]:
# get a list of the files
files = [os.path.join(PATH_TO_DATA, f) for f in os.listdir(PATH_TO_DATA) if f.endswith('.h5')]
files.sort()  # sorting is important because this orders the files with the correct time order

In [13]:
# Have a look at a file
h5 = podpac.data.H5PY(source=files[0])
h5.available_data_keys

['/EASE2_global_projection',
 '/Land-Model-Constants_Data/cell_elevation',
 '/Land-Model-Constants_Data/cell_land_fraction',
 '/Land-Model-Constants_Data/clsm_cdcr1',
 '/Land-Model-Constants_Data/clsm_cdcr2',
 '/Land-Model-Constants_Data/clsm_dzgt1',
 '/Land-Model-Constants_Data/clsm_dzgt2',
 '/Land-Model-Constants_Data/clsm_dzgt3',
 '/Land-Model-Constants_Data/clsm_dzgt4',
 '/Land-Model-Constants_Data/clsm_dzgt5',
 '/Land-Model-Constants_Data/clsm_dzgt6',
 '/Land-Model-Constants_Data/clsm_dzpr',
 '/Land-Model-Constants_Data/clsm_dzrz',
 '/Land-Model-Constants_Data/clsm_dzsf',
 '/Land-Model-Constants_Data/clsm_dztsurf',
 '/Land-Model-Constants_Data/clsm_poros',
 '/Land-Model-Constants_Data/clsm_veghght',
 '/Land-Model-Constants_Data/clsm_wp',
 '/Land-Model-Constants_Data/mwrtm_bh',
 '/Land-Model-Constants_Data/mwrtm_bv',
 '/Land-Model-Constants_Data/mwrtm_clay',
 '/Land-Model-Constants_Data/mwrtm_lewt',
 '/Land-Model-Constants_Data/mwrtm_omega',
 '/Land-Model-Constants_Data/mwrtm_poros

In [14]:
h5.dataset['/Soil_Moisture_Retrieval_Data_AM/soil_moisture']

<HDF5 dataset "soil_moisture": shape (1624, 3856), type "<f4">

In [15]:
# Set the sizes for chunks etc
group_shape = h5.dataset['/Soil_Moisture_Retrieval_Data_AM/soil_moisture'].shape + (len(files), )
group_chunk = (64, 64, 128)

In [16]:
# Make a list of the desired groups 
data_groups = ['/Soil_Moisture_Retrieval_Data_AM/soil_moisture', '/Soil_Moisture_Retrieval_Data_AM/retrieval_qual_flag',
          '/Soil_Moisture_Retrieval_Data_PM/soil_moisture_pm', '/Soil_Moisture_Retrieval_Data_PM/retrieval_qual_flag_pm']
coord_groups = ['/Soil_Moisture_Retrieval_Data_AM/latitude', '/Soil_Moisture_Retrieval_Data_AM/longitude', 'time']


In [17]:
# Initialize the Zarr file
zf = zarr.open(PATH_TO_STORE, mode='a')
zf.attrs['crs'] = 'EPSG:4326'
zf.attrs['nan_vals'] = [-9999]

In [18]:
# Initialize the coordinates
# Latittude
tmp = h5.dataset[coord_groups[0]][:]
tmp[tmp == -9999] = np.nan
tmp = np.nanmean(tmp, axis=1)
# Check for no nans
assert np.any(np.isnan(tmp)) == False
zf['lat'] = tmp

# Longitude
tmp = h5.dataset[coord_groups[1]][:]
tmp[tmp == -9999] = np.nan
tmp = np.nanmean(tmp, axis=0)
# Check for no nans
assert np.any(np.isnan(tmp)) == False
zf['lon'] = tmp

# Time
tmp = [f.split('_')[-3] for f in files]
tmp = [np.datetime64(f[:4] + '-' + f[4:6] + '-' + f[6:]) for f in tmp]
zf['time'] = tmp

In [19]:
# Initialize the data arrays
for dg in data_groups:
    zft = zarr.open(os.path.join(PATH_TO_STORE, dg[1:]), 
              mode='a', shape=group_shape, chunks=group_chunk, dtype='f4', fill_value=-9999)
    zft.attrs['_ARRAY_DIMENSIONS'] = ['lat', 'lon', 'time']

In [20]:
# Save all of the h5 files into the zarr format
data = {dg: [] for dg in data_groups}
errors = []
failed_data = -9999 * np.ones(group_shape[:2])
for i, f in enumerate(files):
    print ("Converting #{} of {}:  {}...".format(i + 1, len(files), f))
    if i < 1664: 
        if (i + 1) % group_chunk[2] == 0:
            print("Writing to Zarr {}:{}".format(i - group_chunk[2] + 1, i + 1), end=':')
            for dg in data_groups:
                print(dg, end=' .. ')
        continue
    h5 = podpac.data.H5PY(source=f)
    for dg in data_groups:
        try:
            raw = h5.dataset[dg][:]
        except Exception as e:
            errors.append((f, dg, e))
            raw = failed_data
        data[dg].append(raw)
    if (i + 1) % group_chunk[2] == 0:
        print("Writing to Zarr {}:{}".format(i - group_chunk[2] + 1, i + 1), end=':')
        for dg in data_groups:
            print(dg, end=' .. ')
            zf[dg][:, :, i - group_chunk[2] + 1: i + 1] = np.stack(data[dg], axis=2)
        data = {dg: [] for dg in data_groups}

Converting #1 of 1759:  D:\SMAP_Data\SMAP_L3_SM_P_E_20150331_R16510_001.h5...
Converting #2 of 1759:  D:\SMAP_Data\SMAP_L3_SM_P_E_20150401_R16510_001.h5...
Converting #3 of 1759:  D:\SMAP_Data\SMAP_L3_SM_P_E_20150402_R16510_001.h5...
Converting #4 of 1759:  D:\SMAP_Data\SMAP_L3_SM_P_E_20150403_R16510_001.h5...
Converting #5 of 1759:  D:\SMAP_Data\SMAP_L3_SM_P_E_20150404_R16510_001.h5...
Converting #6 of 1759:  D:\SMAP_Data\SMAP_L3_SM_P_E_20150405_R16510_001.h5...
Converting #7 of 1759:  D:\SMAP_Data\SMAP_L3_SM_P_E_20150406_R16510_001.h5...
Converting #8 of 1759:  D:\SMAP_Data\SMAP_L3_SM_P_E_20150407_R16510_001.h5...
Converting #9 of 1759:  D:\SMAP_Data\SMAP_L3_SM_P_E_20150408_R16510_001.h5...
Converting #10 of 1759:  D:\SMAP_Data\SMAP_L3_SM_P_E_20150409_R16510_001.h5...
Converting #11 of 1759:  D:\SMAP_Data\SMAP_L3_SM_P_E_20150410_R16510_001.h5...
Converting #12 of 1759:  D:\SMAP_Data\SMAP_L3_SM_P_E_20150411_R16510_001.h5...
Converting #13 of 1759:  D:\SMAP_Data\SMAP_L3_SM_P_E_20150412

In [26]:
print("Writing to Zarr {}:{}".format(i - group_chunk[2] + 1, i + 1), end=':')
for dg in data_groups:
    print(dg, end=' .. ')
    zf[dg][:, :, i - len(data[dg]) + 1: i + 1] = np.stack(data[dg], axis=2)
data = {dg: [] for dg in data_groups}

Writing to Zarr 1631:1759:/Soil_Moisture_Retrieval_Data_AM/soil_moisture .. /Soil_Moisture_Retrieval_Data_AM/retrieval_qual_flag .. /Soil_Moisture_Retrieval_Data_PM/soil_moisture_pm .. /Soil_Moisture_Retrieval_Data_PM/retrieval_qual_flag_pm .. 

In [27]:
errors

[]

In [22]:
# Upload the files to S3 
# (Note, I already tested this on the commandline beforehand, so the outputs of this cell are not representative)
# (I also had to run this command multiple times because of my poor internet connection)
# (Also, the warning about file associations is due to my Windows installation, and not relevant)
!aws s3 sync $PATH_TO_STORE s3://podpac-drought-monitor-s3/SMAP.zarr

Completed 65 Bytes/~89 Bytes (560 Bytes/s) with ~2 file(s) remaining (calculating...)
upload: ..\..\..\..\SMAP.zarr\.zattrs to s3://podpac-drought-monitor-s3/SMAP.zarr/.zattrs
Completed 65 Bytes/~89 Bytes (560 Bytes/s) with ~1 file(s) remaining (calculating...)
Completed 89 Bytes/~89 Bytes (354 Bytes/s) with ~1 file(s) remaining (calculating...)
upload: ..\..\..\..\SMAP.zarr\.zgroup to s3://podpac-drought-monitor-s3/SMAP.zarr/.zgroup
Completed 89 Bytes/~89 Bytes (354 Bytes/s) with ~0 file(s) remaining (calculating...)
Completed 461 Bytes/~46.5 MiB (33 Bytes/s) with ~927 file(s) remaining (calculating...)
upload: ..\..\..\..\SMAP.zarr\Soil_Moisture_Retrieval_Data_AM\retrieval_qual_flag\.zarray to s3://podpac-drought-monitor-s3/SMAP.zarr/Soil_Moisture_Retrieval_Data_AM/retrieval_qual_flag/.zarray
Completed 461 Bytes/~46.5 MiB (33 Bytes/s) with ~926 file(s) remaining (calculating...)
Completed 30.8 KiB/~46.5 MiB (2.1 KiB/s) with ~926 file(s) remaining (calculating...)  
upload: ..\..\..\.

File association not found for extension .py
upload failed: ..\..\..\..\SMAP.zarr\Soil_Moisture_Retrieval_Data_AM\retrieval_qual_flag\0.7.0 to s3://podpac-drought-monitor-s3/SMAP.zarr/Soil_Moisture_Retrieval_Data_AM/retrieval_qual_flag/0.7.0 Could not connect to the endpoint URL: "https://podpac-drought-monitor-s3.s3.amazonaws.com/SMAP.zarr/Soil_Moisture_Retrieval_Data_AM/retrieval_qual_flag/0.7.0"
upload failed: ..\..\..\..\SMAP.zarr\Soil_Moisture_Retrieval_Data_AM\retrieval_qual_flag\0.60.9 to s3://podpac-drought-monitor-s3/SMAP.zarr/Soil_Moisture_Retrieval_Data_AM/retrieval_qual_flag/0.60.9 Could not connect to the endpoint URL: "https://podpac-drought-monitor-s3.s3.amazonaws.com/SMAP.zarr/Soil_Moisture_Retrieval_Data_AM/retrieval_qual_flag/0.60.9"
upload failed: ..\..\..\..\SMAP.zarr\Soil_Moisture_Retrieval_Data_AM\retrieval_qual_flag\1.34.11 to s3://podpac-drought-monitor-s3/SMAP.zarr/Soil_Moisture_Retrieval_Data_AM/retrieval_qual_flag/1.34.11 Could not connect to the endpoint URL