In [1]:
import podpac
import zarr 
import numpy as np
import os

In [2]:
# Local locations used throughout the notebook. The defaults below can be
# overridden without editing the notebook by setting the SMAP_DATA_DIR and
# SMAP_ZARR_STORE environment variables before starting the kernel.
PATH_TO_DATA = os.environ.get('SMAP_DATA_DIR', r'D:\SMAP_Data')      # folder that is scanned for the SMAP .h5 files
PATH_TO_STORE = os.environ.get('SMAP_ZARR_STORE', r'C:\SMAP.zarr')   # Zarr store written by this notebook

In [3]:
# Remember the folder the notebook was started from (the download script path
# is built relative to it), then make the data folder the working directory.
# NOTE: run this cell once per kernel -- a second run would capture the data
# folder as `pwd` and build `script_path` from it.
pwd = os.getcwd()
download_script = 'nsidc-download_SPL3SMP_E.003_2020-01-13.py'
script_path = os.path.join(pwd, download_script)
os.chdir(PATH_TO_DATA)
print("Changed from {} to {}".format(pwd, os.getcwd()))

Changed from C:\Repositories\pipeline\podpac-drought-monitor\notebooks to D:\SMAP_Data


In [4]:
# Download all the SMAP data by running the download script at `script_path`.
# This took multiple days for me and needed almost 1 TB of disk space, so the
# command is left commented out; uncomment it to start the download.
# %run $script_path

In [5]:
# Collect the full paths of all .h5 files in the data folder, in sorted order
files = sorted(
    os.path.join(PATH_TO_DATA, name)
    for name in os.listdir(PATH_TO_DATA)
    if name.endswith('.h5')
)

In [6]:
# Open the first file in the sorted list and display the keys it exposes
sample_file = files[0]
h5 = podpac.data.H5PY(source=sample_file)
h5.available_keys

['/Soil_Moisture_Retrieval_Data_AM/EASE_column_index',
 '/Soil_Moisture_Retrieval_Data_AM/EASE_row_index',
 '/Soil_Moisture_Retrieval_Data_AM/albedo',
 '/Soil_Moisture_Retrieval_Data_AM/boresight_incidence',
 '/Soil_Moisture_Retrieval_Data_AM/bulk_density',
 '/Soil_Moisture_Retrieval_Data_AM/clay_fraction',
 '/Soil_Moisture_Retrieval_Data_AM/freeze_thaw_fraction',
 '/Soil_Moisture_Retrieval_Data_AM/grid_surface_status',
 '/Soil_Moisture_Retrieval_Data_AM/latitude',
 '/Soil_Moisture_Retrieval_Data_AM/latitude_centroid',
 '/Soil_Moisture_Retrieval_Data_AM/longitude',
 '/Soil_Moisture_Retrieval_Data_AM/longitude_centroid',
 '/Soil_Moisture_Retrieval_Data_AM/radar_water_body_fraction',
 '/Soil_Moisture_Retrieval_Data_AM/retrieval_qual_flag',
 '/Soil_Moisture_Retrieval_Data_AM/roughness_coefficient',
 '/Soil_Moisture_Retrieval_Data_AM/soil_moisture',
 '/Soil_Moisture_Retrieval_Data_AM/soil_moisture_error',
 '/Soil_Moisture_Retrieval_Data_AM/static_water_body_fraction',
 '/Soil_Moisture_Retr

In [7]:
# Display the AM soil-moisture dataset of the opened file (its repr shows shape and dtype)
h5.dataset['/Soil_Moisture_Retrieval_Data_AM/soil_moisture']

<HDF5 dataset "soil_moisture": shape (1624, 3856), type "<f4">

In [9]:
# Array layout for the Zarr store: the grid shape of one file's soil-moisture
# dataset, followed by one extra trailing axis with one entry per file.
grid_shape = h5.dataset['/Soil_Moisture_Retrieval_Data_AM/soil_moisture'].shape
group_shape = (*grid_shape, len(files))
# Chunk size along each of the three axes
group_chunk = (64, 64, 128)

In [10]:
# HDF5 keys of the variables that are copied into the Zarr store
data_groups = [
    '/Soil_Moisture_Retrieval_Data_AM/soil_moisture',
    '/Soil_Moisture_Retrieval_Data_AM/retrieval_qual_flag',
    '/Soil_Moisture_Retrieval_Data_PM/soil_moisture_pm',
    '/Soil_Moisture_Retrieval_Data_PM/retrieval_qual_flag_pm',
]
# Coordinate sources: the first two entries are HDF5 keys, the last is the name 'time'
coord_groups = [
    '/Soil_Moisture_Retrieval_Data_AM/latitude',
    '/Soil_Moisture_Retrieval_Data_AM/longitude',
    'time',
]


In [18]:
# Open the Zarr store in append mode (mode='a') and record the store-level
# metadata: the coordinate reference system and the no-data value(s).
zf = zarr.open(PATH_TO_STORE, mode='a')
store_attrs = {'crs': 'EPSG:4326', 'nan_vals': [-9999]}
for attr_name, attr_value in store_attrs.items():
    zf.attrs[attr_name] = attr_value

In [16]:
# Initialize the coordinates
# Latitude is the mean over axis 1 and longitude the mean over axis 0 of the
# corresponding HDF5 arrays; -9999 entries are turned into NaN first so that
# they are ignored by the mean.
for coord_name, h5_key, mean_axis in (('lat', coord_groups[0], 1),
                                      ('lon', coord_groups[1], 0)):
    grid = h5.dataset[h5_key][:]
    grid[grid == -9999] = np.nan
    axis_mean = np.nanmean(grid, axis=mean_axis)
    # Check for no nans
    assert not np.isnan(axis_mean).any()
    zf[coord_name] = axis_mean

# Time: one date per file, taken from the third-from-last '_'-separated field
# of the file path (an 8-digit YYYYMMDD stamp)
date_stamps = [path.split('_')[-3] for path in files]
zf['time'] = [np.datetime64('-'.join((stamp[:4], stamp[4:6], stamp[6:]))) for stamp in date_stamps]

In [31]:
# Create one array per data group inside the store.
# mode='w' is used, so re-running this cell re-creates the arrays.
array_options = dict(mode='w', shape=group_shape, chunks=group_chunk, dtype='f4', fill_value=-9999)
for group_key in data_groups:
    # Drop the leading '/' of the HDF5 key to get a path relative to the store
    array_path = os.path.join(PATH_TO_STORE, group_key[1:])
    zft = zarr.open(array_path, **array_options)

In [33]:
# Save all of the h5 files into the zarr format.
# Slice i along the last axis of every data array receives the data of files[i].
for i, f in enumerate(files):
    print ("Converting #{} of {}:  {}...".format(i + 1, len(files), f))
    # Open the file of THIS iteration. (Opening files[0] here, as before, copied
    # the first file's data into every slice.) A separate name is used so that
    # `h5` keeps pointing at the first file for the cells that inspect it.
    h5_current = podpac.data.H5PY(source=f)
    for dg in data_groups:
        zf[dg][:, :, i] = h5_current.dataset[dg]

Converting #0 of 1759:  D:\SMAP_Data\SMAP_L3_SM_P_E_20150331_R16510_001.h5...
Converting #1 of 1759:  D:\SMAP_Data\SMAP_L3_SM_P_E_20150401_R16510_001.h5...
Converting #2 of 1759:  D:\SMAP_Data\SMAP_L3_SM_P_E_20150402_R16510_001.h5...
Converting #3 of 1759:  D:\SMAP_Data\SMAP_L3_SM_P_E_20150403_R16510_001.h5...
Converting #4 of 1759:  D:\SMAP_Data\SMAP_L3_SM_P_E_20150404_R16510_001.h5...
Converting #5 of 1759:  D:\SMAP_Data\SMAP_L3_SM_P_E_20150405_R16510_001.h5...
Converting #6 of 1759:  D:\SMAP_Data\SMAP_L3_SM_P_E_20150406_R16510_001.h5...
Converting #7 of 1759:  D:\SMAP_Data\SMAP_L3_SM_P_E_20150407_R16510_001.h5...
Converting #8 of 1759:  D:\SMAP_Data\SMAP_L3_SM_P_E_20150408_R16510_001.h5...
Converting #9 of 1759:  D:\SMAP_Data\SMAP_L3_SM_P_E_20150409_R16510_001.h5...
Converting #10 of 1759:  D:\SMAP_Data\SMAP_L3_SM_P_E_20150410_R16510_001.h5...
Converting #11 of 1759:  D:\SMAP_Data\SMAP_L3_SM_P_E_20150411_R16510_001.h5...
Converting #12 of 1759:  D:\SMAP_Data\SMAP_L3_SM_P_E_20150412_

In [7]:
# Upload the Zarr store to S3.
# (Note: I had already tested this on the command line beforehand, so the outputs of this cell are not representative.)
# (Also, the warning about file associations is due to my Windows installation and is not relevant.)
!aws s3 sync $PATH_TO_STORE s3://podpac-drought-monitor-s3/SMAP.zarr

File association not found for extension .py
