In [2]:
%matplotlib inline
import requests
import datetime as dt 
import pandas as pd
import geopandas as gpd
import contextily as ctx 
from shapely.geometry import MultiPolygon, Polygon, box
from shapely.ops import orient

import pygc
import h5py
import numpy as np
from glob import glob
import os
from os import path
from shapely.geometry import MultiPolygon, Polygon
from shapely.ops import orient
import fiona

from csv import writer

#### 1. Search with bounding box
NASA EarthData's unique ID for this dataset (called `Concept ID`) is needed for searching the dataset. The dataset Digital Object Identifier or DOI can be used to obtain the `Concept ID`.

In [3]:
# variables for data search

# downloaded data file paths
h5_directory = 'C:/gedi_data'
gedi_shp_directory = './gedi_shp'
outdir = './subsets'

# coordinates file path
main_datafile_path = "estingAustralia.csv"

# lengths in m
ew_width = 1000
ns_height = 1000
size = int(ew_width/1000)

start_date = dt.datetime(2022, 1, 1) # specify your own start date
end_date = dt.datetime(2022, 6, 1)  # specify your end start date

In [4]:
doi = '10.3334/ORNLDAAC/2017'# GEDI L4A DOI 

# CMR API base url
cmrurl='https://cmr.earthdata.nasa.gov/search/' 

doisearch = cmrurl + 'collections.json?doi=' + doi
response = requests.get(doisearch)
response.raise_for_status()
concept_id = response.json()['feed']['entry'][0]['id']

print(concept_id)

C2244602422-ORNL_CLOUD


In [5]:
# function to obtain bounding box from coordinate
def latLonBoxByWandH(lat,lon,ew_width,ns_height,site):
    lats, lons = [], []
    #distance in m, az (in deg), lat (in deg), long (in deg)

    res = pygc.great_circle(distance=ew_width/2, azimuth=90, latitude=lat, longitude=lon)
    lat, lon = res['latitude'], res['longitude']

    res = pygc.great_circle(distance=ns_height/2, azimuth=180, latitude=lat, longitude=lon)
    lat, lon = res['latitude'], res['longitude']
    lats.append(lat), lons.append(lon)

    res = pygc.great_circle(distance=ew_width, azimuth=270, latitude=lat, longitude=lon)
    lat, lon = res['latitude'], res['longitude']
    lats.append(lat), lons.append(lon)

    res = pygc.great_circle(distance=ns_height, azimuth=0, latitude=lat, longitude=lon)
    lat, lon = res['latitude'], res['longitude']
    lats.append(lat), lons.append(lon)

    res = pygc.great_circle(distance=ew_width, azimuth=90, latitude=lat, longitude=lon)
    lat, lon = res['latitude'], res['longitude']
    lats.append(lat), lons.append(lon)
    
    return {'lats':lats,'lons':lons, 'site': site}

In [6]:
# loading coordinates file
treecoords = pd.read_csv(main_datafile_path)
treecoords

Unnamed: 0.1,Unnamed: 0,project,site,lat,long,Granule Number
0,0,SouthWestForests-DON019FireInv,k_1,-34.7310,116.2081,2
1,1,SouthWestForests-DON019FireInv,k_2,-34.7265,116.2081,2
2,2,SouthWestForests-DON019FireInv,k_3,-34.6949,116.2085,2
3,3,SouthWestForests-DON019FireInv,k_4,-34.7265,116.2136,1
4,4,SouthWestForests-DON019FireInv,k_5,-34.7221,116.2136,1
...,...,...,...,...,...,...
241,318,LIRE,k_242,-41.3530,147.5222,1
242,319,Ausplot Forest Monitoring Network,k_243,-41.3671,147.6032,3
243,320,LIPL,k_244,-42.4391,147.7789,3
244,321,LIPL,k_245,-42.7232,147.8451,5


In [7]:
#file cleanup
def removefiles(directory, ext) :
    for filename in os.listdir(directory):
        f = os.path.join(directory, filename)
        # checking if it is a file
        if ext == "any" :
            if os.path.isfile(f):
                os.remove(f)
        else:
            if f.endswith(ext):
                os.remove(f)

In [8]:
# function to clean downloaded files
def renamefiles(directory) :
    for filename in os.listdir(directory):
        f = os.path.join(directory, filename)
        # checking if it is a file
        if os.path.isfile(f):
            pre, ext = os.path.splitext(f)
            os.rename(f, pre + '.h5')

In [9]:
# function to create ESRI shapefile
def createshp(dataframe) :
    # define schema
    schema = {
        'geometry':'Polygon',
        'properties':[('Name','str')]
    }

    #open a fiona object
    polyShp = fiona.open('gedi_shp/cropcopymark.shp', mode='w', driver='ESRI Shapefile',
            schema = schema, crs = "EPSG:4326")

    #get list of points
    xyList = []
    rowName = ''
    for index, row in dataframe.iterrows():
        xyList.append((row.lons,row.lats))
        rowName = row.site
    xyList[:5]

    #save record and close shapefile
    rowDict = {
    'geometry' : {'type':'Polygon',
                    'coordinates': [xyList]}, #Here the xyList is in brackets
    'properties': {'Name' : rowName},
    }
    polyShp.write(rowDict)
    #close fiona object
    polyShp.close()
    

In [10]:
# function to look for dataset with polygon
def lookdata(grsm_poly):
    # CMR formatted start and end times
    dt_format = '%Y-%m-%dT%H:%M:%SZ'
    temporal_str = start_date.strftime(dt_format) + ',' + end_date.strftime(dt_format)
    
    # converting to WGS84 coordinate system
    grsm_epsg4326 = grsm_poly.to_crs(epsg=4326)

    # orienting coordinates clockwise
    grsm_epsg4326.geometry = grsm_epsg4326.geometry.apply(orient, args=(1,))

    # reducing number of vertices in the polygon
    # CMR has 1000000 bytes limit
    grsm_epsg4326 = grsm_epsg4326.simplify(0.0005)

    geojson = {"shapefile": ("grsm.json", grsm_epsg4326.geometry.to_json(), "application/geo+json")}

    page_num = 1
    page_size = 2000 # CMR page size limit

    granule_arr = []

    while True:
        
        # defining parameters
        cmr_param = {
            "collection_concept_id": concept_id, 
            "page_size": page_size,
            "page_num": page_num,
            "temporal": temporal_str,
            "simplify-shapefile": 'true' # this is needed to bypass 5000 coordinates limit of CMR
        }
        
        granulesearch = cmrurl + 'granules.json'
        response = requests.post(granulesearch, data=cmr_param, files=geojson)
        granules = response.json()['feed']['entry']
        
        if granules:
            for g in granules:
                granule_url = ''
                granule_poly = ''
                
                # read file size
                granule_size = float(g['granule_size'])
                
                # reading bounding geometries
                if 'polygons' in g:
                    polygons= g['polygons']
                    multipolygons = []
                    for poly in polygons:
                        i=iter(poly[0].split(" "))
                        ltln = list(map(" ".join,zip(i,i)))
                        multipolygons.append(Polygon([[float(p.split(" ")[1]), float(p.split(" ")[0])] for p in ltln]))
                    granule_poly = MultiPolygon(multipolygons)
                
                # Get URL of HDF5 files
                for links in g['links']:
                    if 'title' in links and links['title'].startswith('Download') \
                    and links['title'].endswith('.h5'):
                        granule_url = links['href']
                granule_arr.append([granule_url, granule_size, granule_poly])
                
            page_num += 1
        else: 
            break

    # adding bound as the last row into the dataframe
    # we will use this later in the plot
    granule_arr.append(['GRSM', 0, grsm_epsg4326.geometry.item() ]) 

    # creating a pandas dataframe
    l4adf = pd.DataFrame(granule_arr, columns=["granule_url", "granule_size", "granule_poly"])

    # Drop granules with empty geometry
    l4adf = l4adf[l4adf['granule_poly'] != '']
    return l4adf

In [11]:
# function to loop through all downloaded files and create clipped versions
def clipping(indir, outdir, grsm_poly) :
    # converting to WGS84 coordinate system
    grsm_epsg4326 = grsm_poly.to_crs(epsg=4326)

    for infile in glob(path.join(indir, 'GEDI04_A*.h5')):
        name, ext = path.splitext(path.basename(infile))
        subfilename = "{name}_sub{ext}".format(name=name, ext=ext)
        outfile = path.join(outdir, path.basename(subfilename))
        hf_in = h5py.File(infile, 'r')
        hf_out = h5py.File(outfile, 'w')
        
        # copy ANCILLARY and METADATA groups
        var1 = ["/ANCILLARY", "/METADATA"]
        for v in var1:
            hf_in.copy(hf_in[v],hf_out)
        
        # loop through BEAMXXXX groups
        for v in list(hf_in.keys()):
            if v.startswith('BEAM'):
                beam = hf_in[v]
                # find the shots that overlays the area of interest (GRSM)
                lat = beam['lat_lowestmode'][:]
                lon = beam['lon_lowestmode'][:]
                i = np.arange(0, len(lat), 1) # index
                geo_arr = list(zip(lat,lon, i))
                l4adf = pd.DataFrame(geo_arr, columns=["lat_lowestmode", "lon_lowestmode", "i"])
                l4agdf = gpd.GeoDataFrame(l4adf, geometry=gpd.points_from_xy(l4adf.lon_lowestmode, l4adf.lat_lowestmode))
                l4agdf.crs = "EPSG:4326"
                l4agdf_gsrm = l4agdf[l4agdf['geometry'].within(grsm_epsg4326.geometry[0])]  
                indices = l4agdf_gsrm.i

                # copy BEAMS to the output file
                for key, value in beam.items():
                    if isinstance(value, h5py.Group):
                        for key2, value2 in value.items():
                            group_path = value2.parent.name
                            group_id = hf_out.require_group(group_path)
                            dataset_path = group_path + '/' + key2
                            hf_out.create_dataset(dataset_path, data=value2[:][indices])
                            for attr in value2.attrs.keys():
                                hf_out[dataset_path].attrs[attr] = value2.attrs[attr]
                    else:
                        group_path = value.parent.name
                        group_id = hf_out.require_group(group_path)
                        dataset_path = group_path + '/' + key
                        hf_out.create_dataset(dataset_path, data=value[:][indices])
                        for attr in value.attrs.keys():
                            hf_out[dataset_path].attrs[attr] = value.attrs[attr]

        hf_in.close()
        hf_out.close()

In [12]:
def create_dataframe(outdir) :
    lat_l = []
    lon_l = []
    agbd = []
    for subfile in glob(path.join(outdir, 'GEDI04_A*.h5')):
        hf_in = h5py.File(subfile, 'r')
        for v in list(hf_in.keys()):
            if v.startswith('BEAM'):
                beam = hf_in[v]
                lat_l.extend(beam['lat_lowestmode'][:].tolist()) 
                lon_l.extend(beam['lon_lowestmode'][:].tolist()) 
                agbd.extend(beam['agbd'][:].tolist())
        hf_in.close()
    geo_arr = list(zip(agbd,lat_l,lon_l))
    df = pd.DataFrame(geo_arr, columns=["agbd", "lat_lowestmode", "lon_lowestmode"])
    gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.lon_lowestmode, df.lat_lowestmode))
    return gdf

In [13]:
# append to csv file: site, lat, long, abgdavg
def updatefile(site, lat, lon, agbd_avg) :
    List = [site, lat, lon, agbd_avg]
    with open('subsets/grsm_subset.csv', 'a') as f_object:
    
        # Pass this file object to csv.writer()
        # and get a writer object
        writer_object = writer(f_object)
    
        # Pass the list as an argument into
        # the writerow()
        writer_object.writerow(List)
    
        # Close the file object
        f_object.close()

In [14]:
# subset_df = create_dataframe()
for index, row in treecoords.iterrows():
    # extract important values for ESRI shapefile
    lat = row['lat']
    lon = row['long']
    site = row['site']
    bbox = latLonBoxByWandH(lat,lon,ew_width,ns_height,site)
    tmp = pd.DataFrame.from_dict(bbox, orient='columns', dtype=None, columns=None)
    
    # clean up gedi_shp file directory
    removefiles(gedi_shp_directory, "any")
    removefiles(h5_directory, ".h5")
    
    # create ESRI shapefile
    createshp(tmp)
    
    # look for matching data with polygon
    grsm_poly = gpd.read_file('gedi_shp/cropcopymark.shp')
    l4adf = lookdata(grsm_poly)
    
    # drop duplicate URLs if any
    l4a_granules = l4adf[:-1].drop_duplicates(subset=['granule_url'])
    l4a_granules.to_csv('granules.txt', columns = ['granule_url'], index=False, header = False)
    
    # clean up h5 file directory
    removefiles(h5_directory, ".h5")
    
    # download data
    !wget.exe --load-cookies .urs_cookies --save-cookies .urs_cookies --keep-session-cookies -P /gedi_data  -nc --content-disposition --trust-server-names -i granules.txt
    
    # rename downloaded files
    renamefiles(h5_directory)
    
    # loop through downloaded h5 files and create clipped versions
    if (os.path.isdir(site)) :
        pass
    else:
        newpath = os.path.join(outdir, site)
        os.mkdir(newpath)
    clipping(h5_directory,newpath, grsm_poly)
    
    # create dataframe of relevant GEDI points from clipped versions
    # subset_df = create_dataframe(outdir)
    
    # clear directories
    removefiles(h5_directory, ".h5")
    removefiles(gedi_shp_directory, "any")
    

No URLs found in granules.txt.


FileExistsError: [WinError 183] Cannot create a file when that file already exists: 'C:/gedi_data\\GEDI04_A_2022019203138_O17586_04_T07726_02_002_02_V002.h5@A-userid=yeowanli&Expires=1698599946&Signature=tTgXxByw4xBfm-uw~rhLQRb6MudBR2LnYMNW1ytQ-Rd8lKfDeddOPS6nVq3BHRzC4igN5lH-RnVRXwzuRVmMu6DvqDH8XECxN5A9ObybsRsa0yii14OCsz1AgzpKDOiF-qhCh8qb' -> 'C:/gedi_data\\GEDI04_A_2022019203138_O17586_04_T07726_02_002_02_V002.h5'