# Inventory list and status check - GEDI 

In [49]:
import pandas as pd
import datetime
import os

'04-07-2023'

## Inventory list

### Data Pool with all files
L1/L2 : https://e4ftl01.cr.usgs.gov/GEDI/

L3/L4 : https://daac.ornl.gov/daacdata/gedi/

In [13]:
#!pip install beautifulsoup4

In [174]:
avail_products = ["GEDI01_B.002","GEDI02_A.002","GEDI02_B.002"]
product = avail_products[2]

In [175]:
import requests
from bs4 import BeautifulSoup
url = f"https://e4ftl01.cr.usgs.gov/GEDI/{product}/"

In [282]:
l3l4_products = ["GEDI_L3_LandSurface_Metrics_V2","GEDI_L4A_AGB_Density_V2_1","GEDI_L4B_Gridded_Biomass"]
product = l3l4_products[0]
url = f"https://daac.ornl.gov/daacdata/gedi/{product}/"

In [232]:
url

'https://daac.ornl.gov/daacdata/gedi/GEDI_L4A_AGB_Density_V2_1/'

In [233]:
# Fetch the product's top-level directory listing and collect every link that
# appears after the "Parent Directory" entry; links before it are skipped.
# A timeout is passed because requests waits indefinitely by default, which
# would leave the kernel hanging if the server stalls.
response = requests.get(url, timeout=60)
next_level_links = []
valid = False  # becomes True once the "Parent Directory" link has been seen
if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')
    links = soup.find_all('a', href=True)  # find all <a> elements with href attribute
    for link in links:
        if valid:
            # hrefs are appended to the listing URL as-is (assumes relative hrefs)
            next_level_links.append(url + link['href'])
        if link.get_text() == "Parent Directory":
            valid = True
else:
    print("Failed to retrieve directory listing")

In [234]:
next_level_links

['https://daac.ornl.gov/daacdata/gedi/GEDI_L4A_AGB_Density_V2_1/comp/',
 'https://daac.ornl.gov/daacdata/gedi/GEDI_L4A_AGB_Density_V2_1/data/']

In [235]:
# Flat list of every file URL found; filled as a side effect of get_file_dict().
direct_download_links = []


def get_file_dict(url, session=None, timeout=60):
    """Recursively walk an HTML directory listing and collect its file links.

    Only links that appear after the "Parent Directory" entry are considered.
    A link whose href ends with '/' is treated as a sub-directory and crawled
    recursively; any other link is treated as a file.

    Parameters
    ----------
    url : str
        Listing URL. hrefs are concatenated onto it directly, so it is
        expected to end with '/'.
    session : requests.Session, optional
        Session used for the requests so connections can be reused across the
        crawl. When omitted, the module-level ``requests.get`` is used.
    timeout : float, optional
        Seconds to wait on each request before giving up.

    Returns
    -------
    dict
        Maps link text to the full file URL for every file under ``url``.
        Files that share the same link text overwrite each other. An empty
        dict is returned when the listing cannot be retrieved.

    Side effects
    ------------
    Every file URL is also appended to the module-level
    ``direct_download_links`` list.
    """
    http = session if session is not None else requests
    try:
        response = http.get(url, timeout=timeout)
    except requests.RequestException as err:
        # Report and continue, like the non-200 branch below, so one bad
        # directory does not abort the whole crawl.
        print(f"Failed to retrieve directory listing for {url}: {err}")
        return {}

    if response.status_code != 200:
        print(f"Failed to retrieve directory listing for {url}")
        return {}

    soup = BeautifulSoup(response.content, 'html.parser')
    file_dict = {}
    valid = False  # becomes True once the "Parent Directory" link has been seen
    for link in soup.find_all('a', href=True):
        href = link['href']
        if link.get_text() == "Parent Directory":
            valid = True
        elif valid:
            if href.endswith('/'):
                # Sub-directory: merge the files found below it.
                file_dict.update(get_file_dict(url + href, session=session, timeout=timeout))
            else:
                direct_download_links.append(url + href)
                file_dict[link.text] = url + href
    return file_dict

In [236]:
import tqdm

In [237]:
for l in tqdm.tqdm(next_level_links):
    day_file_dict = get_file_dict(l)
    #print(len(day_file_dict))

100%|███████████████████████████████████| 2/2 [00:30<00:00, 15.08s/it]


In [214]:

#product_file_dict = get_file_dict(url)

In [238]:
len(direct_download_links)

70701

In [None]:
direct_download_links

In [240]:
today_s = datetime.datetime.now().date().strftime("%m-%d-%Y")
init_inventory = pd.DataFrame({'file_location':direct_download_links,'cache':'no','last_check':''})

In [241]:
init_inventory

Unnamed: 0,file_location,cache,last_check
0,https://daac.ornl.gov/daacdata/gedi/GEDI_L4A_A...,no,
1,https://daac.ornl.gov/daacdata/gedi/GEDI_L4A_A...,no,
2,https://daac.ornl.gov/daacdata/gedi/GEDI_L4A_A...,no,
3,https://daac.ornl.gov/daacdata/gedi/GEDI_L4A_A...,no,
4,https://daac.ornl.gov/daacdata/gedi/GEDI_L4A_A...,no,
...,...,...,...
70696,https://daac.ornl.gov/daacdata/gedi/GEDI_L4A_A...,no,
70697,https://daac.ornl.gov/daacdata/gedi/GEDI_L4A_A...,no,
70698,https://daac.ornl.gov/daacdata/gedi/GEDI_L4A_A...,no,
70699,https://daac.ornl.gov/daacdata/gedi/GEDI_L4A_A...,no,


In [242]:
init_inventory.to_csv(f'../../data/gedi/inventory_{product}_latest.csv',index=False)

### GEDI finder
https://git.earthdata.nasa.gov/projects/LPDUR/repos/gedi-finder-tutorial-python/browse/GEDI_Finder.py

In [2]:
import requests as r
from datetime import datetime
import os

In [4]:
##################################### Define Function to Query CMR ################################
def gedi_finder(product, bbox):
    """Query NASA CMR for the GEDI granules that intersect a bounding box.

    Parameters
    ----------
    product : str
        One of 'GEDI01_B.002', 'GEDI02_A.002', 'GEDI02_B.002'.
    bbox : str
        Bounding box as 'LL_lon,LL_lat,UR_lon,UR_lat'. Spaces are removed
        before the value is placed in the query string.

    Returns
    -------
    list of str
        The first link (``links[0]['href']``) of every granule entry returned
        by CMR. An empty list is returned when nothing matches, or when a
        request fails (in which case CMR's reply is printed first).

    Raises
    ------
    KeyError
        If ``product`` is not one of the known products.
    """
    page_size = 2000  # requested page size; the original comment states 2000 is the max allowed

    # Define the base CMR granule search url, including LPDAAC provider name and page size
    cmr = f"https://cmr.earthdata.nasa.gov/search/granules.json?pretty=true&provider=LPDAAC_ECS&page_size={page_size}&concept_id="

    # Set up dictionary where key is GEDI shortname + version and value is CMR Concept ID
    concept_ids = {'GEDI01_B.002': 'C1908344278-LPDAAC_ECS',
                   'GEDI02_A.002': 'C1908348134-LPDAAC_ECS',
                   'GEDI02_B.002': 'C1908350066-LPDAAC_ECS'}
    concept_id = concept_ids[product]

    bbox = bbox.replace(' ', '')  # Remove any white spaces

    def page_url(page_number):
        """Build the granule search URL for one result page."""
        return f"{cmr}{concept_id}&bounding_box={bbox}&pageNum={page_number}"

    # CMR uses pagination for queries with more features returned than the page size
    page = 1
    try:
        cmr_response = []
        while True:
            entries = r.get(page_url(page), timeout=60).json()['feed']['entry']
            cmr_response += entries
            # A short (or empty) page is the last one. Testing the size of the
            # page just received, rather than `len(total) % 2000 == 0`, also
            # terminates when there are 0 results or an exact multiple of 2000.
            if len(entries) < page_size:
                break
            page += 1

        # CMR returns more info than just the Data Pool links, below use list comprehension to return a list of DP links
        return [c['links'][0]['href'] for c in cmr_response]
    except Exception:
        # If the request did not complete successfully, print out the response from CMR
        print(r.get(page_url(page), timeout=60).json())
        # Return an empty list rather than None so callers can still call len() / iterate.
        return []
        
################################ Execute GEDI Finder Function #####################################
# Inputs to edit: the product to search for and the region of interest.
product = 'GEDI01_B.002'           # Options include 'GEDI01_B.002', 'GEDI02_A.002', 'GEDI02_B.002'
#bbox = '-73.65,-12.64,-47.81,9.7'  # bounding box coordinates in LL Longitude, LL Latitude, UR Longitude, UR Latitude format
bbox = '-76.964657,38.978967,-76.928008,39.002783'

# Run the search and report how many granule links came back
granules = gedi_finder(product, bbox)
print(f"{len(granules)} {product} Version 2 granules found.")

#################################### Export Results ###############################################
# The output file name carries the product (dots replaced by underscores) and a run timestamp
timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
outName = f"{product.replace('.', '_')}_GranuleList_{timestamp}.txt"

# One granule link per line
with open(outName, "w") as gf:
    gf.writelines(f"{g}\n" for g in granules)

24 GEDI01_B.002 Version 2 granules found.


In [5]:
# Request granule metadata from CMR with a STAC-profile Accept header.
# Only the raw response object is kept here; it is inspected in the next cell.

# Search inputs (edit for the product / region of interest)
product = 'GEDI01_B.002'           # Options include 'GEDI01_B.002', 'GEDI02_A.002', 'GEDI02_B.002'
#bbox = '-73.65,-12.64,-47.81,9.7'  # bounding box coordinates in LL Longitude, LL Latitude, UR Longitude, UR Latitude format
bbox = '-76.964657,38.978967,-76.928008,39.002783'

# Base granule-search URL: LPDAAC provider, page size 2000, concept id appended below
cmr = "https://cmr.earthdata.nasa.gov/search/granules.json?pretty=true&provider=LPDAAC_ECS&page_size=2000&concept_id="

# GEDI shortname + version -> CMR Concept ID
concept_ids = {
    'GEDI01_B.002': 'C1908344278-LPDAAC_ECS',
    'GEDI02_A.002': 'C1908348134-LPDAAC_ECS',
    'GEDI02_B.002': 'C1908350066-LPDAAC_ECS',
}

# Only the first result page is requested
page = 1
bbox = bbox.replace(' ', '')  # Remove any white spaces

# GET the first page for this product and bounding box
cmr_response = r.get(
    f"{cmr}{concept_ids[product]}&bounding_box={bbox}&pageNum={page}",
    headers={'Accept': "application/json; profile=stac-catalogue"},
)


In [None]:
cmr_response.json()['feed']['entry']

## Inventory check

In [298]:
["GEDI01_B.002","GEDI02_A.002","GEDI02_B.002"]

['GEDI01_B.002', 'GEDI02_A.002', 'GEDI02_B.002']

In [None]:
["GEDI_L3_LandSurface_Metrics_V2","GEDI_L4A_AGB_Density_V2_1","GEDI_L4B_Gridded_Biomass"]

In [436]:
product = "GEDI02_B.002"

In [437]:
inventory = pd.read_csv(f'../../data/gedi/inventory_{product}_latest.csv')

In [438]:
local_cache_root = "../../../daac_data_download_python/data/"

In [439]:
product

'GEDI02_B.002'

In [440]:
def check_file(x,product):
    """Tell whether the remote file ``x`` already exists in the local cache.

    The local path is ``local_cache_root + product +`` everything that follows
    the first occurrence of ``product`` in the URL.

    Parameters
    ----------
    x : str
        Remote file URL containing ``product`` as a path component.
    product : str
        Product name, e.g. "GEDI02_B.002" or "GEDI_L3_LandSurface_Metrics_V2".

    Returns
    -------
    str
        'yes' if the corresponding local file exists, otherwise 'no'.

    Raises
    ------
    IndexError
        If ``product`` does not occur in ``x``.
    """
    # maxsplit=1 keeps everything after the product directory in one piece.
    # Without it, a file name that repeats the product name (for example
    # ".../<product>/comp/<product>.pdf") is split again and only the
    # directory part would be used.
    relative_path = x.split(product, 1)[1]
    fp = local_cache_root + product + relative_path
    return 'yes' if os.path.isfile(fp) else 'no'

In [390]:
def check_file(x,product):
    """Tell whether the remote file ``x`` already exists in the local cache.

    Works for both URL layouts used in this notebook
    (".../GEDI/<product>/..." and ".../gedi/<product>/..."): the local path is
    ``local_cache_root + product +`` everything that follows the first
    occurrence of ``product`` in the URL.

    Parameters
    ----------
    x : str
        Remote file URL containing ``product`` as a path component.
    product : str
        Product name, e.g. "GEDI02_B.002" or "GEDI_L3_LandSurface_Metrics_V2".

    Returns
    -------
    str
        'yes' if the corresponding local file exists, otherwise 'no'.

    Raises
    ------
    IndexError
        If ``product`` does not occur in ``x``.
    """
    # Split on the product name alone (not "gedi/" + product) so URLs whose
    # parent directory is upper-case "GEDI/" are handled too. maxsplit=1 keeps
    # file names that repeat the product name intact.
    relative_path = x.split(product, 1)[1]
    fp = local_cache_root + product + relative_path
    return 'yes' if os.path.isfile(fp) else 'no'

In [448]:
inventory['cache'] =inventory.file_location.map(lambda x:check_file(x,product))

In [449]:
# Stamp every cached file with today's date (MM-DD-YYYY).
# pd.Timestamp is used instead of datetime.datetime.now() because an earlier
# cell runs `from datetime import datetime`, which rebinds the name `datetime`
# to the class and makes `datetime.datetime` fail on a top-to-bottom run.
today_s = pd.Timestamp.now().strftime("%m-%d-%Y")

# A `last_check` column that was saved empty is presumably read back from CSV
# as float NaN; cast to object first so the date strings can be stored without
# a dtype conflict.
inventory['last_check'] = inventory['last_check'].astype(object)
inventory.loc[inventory.cache=="yes",'last_check']=today_s

In [450]:
inventory

Unnamed: 0,file_location,cache,last_check
0,https://e4ftl01.cr.usgs.gov/GEDI/GEDI02_B.002/...,yes,05-02-2023
1,https://e4ftl01.cr.usgs.gov/GEDI/GEDI02_B.002/...,yes,05-02-2023
2,https://e4ftl01.cr.usgs.gov/GEDI/GEDI02_B.002/...,yes,05-02-2023
3,https://e4ftl01.cr.usgs.gov/GEDI/GEDI02_B.002/...,yes,05-02-2023
4,https://e4ftl01.cr.usgs.gov/GEDI/GEDI02_B.002/...,yes,05-02-2023
...,...,...,...
209569,https://e4ftl01.cr.usgs.gov/GEDI/GEDI02_B.002/...,yes,05-02-2023
209570,https://e4ftl01.cr.usgs.gov/GEDI/GEDI02_B.002/...,yes,05-02-2023
209571,https://e4ftl01.cr.usgs.gov/GEDI/GEDI02_B.002/...,yes,05-02-2023
209572,https://e4ftl01.cr.usgs.gov/GEDI/GEDI02_B.002/...,yes,05-02-2023


In [451]:
inventory.cache.value_counts()

yes    209574
Name: cache, dtype: int64

In [452]:
inventory.to_csv(f'../../data/gedi/inventory_{product}_latest.csv',index=False)

## Cache to local


In [86]:
# Load necessary packages into Python
from subprocess import Popen
from getpass import getpass
from netrc import netrc
import argparse
import time
import os
import requests

In [None]:
# ----------------------------------USER-DEFINED VARIABLES--------------------------------------- #
# Set up command line arguments
# NOTE(review): this cell looks like it was carried over from a command-line
# download script. parse_args() reads the process's argv and both options are
# required, so inside a notebook kernel it will presumably exit with a usage
# error -- confirm before relying on `saveDir` / `files` further down.
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-dir', '--directory', required=True, help='Specify directory to save files to')
parser.add_argument('-f', '--files', required=True, help='A single granule URL, or the location of csv or textfile containing granule URLs')
args = parser.parse_args()

saveDir = args.directory  # Set local directory to download to
files = args.files        # Define file(s) to download from the LP DAAC Data Pool
# Prompt texts for the username (index 0) and password (index 1); they are read
# by the authentication cell when it has to ask for Earthdata Login credentials.
prompts = ['Enter NASA Earthdata Login Username \n(or create an account at urs.earthdata.nasa.gov): ',
           'Enter NASA Earthdata Login Password: ']

In [None]:
# ---------------------------------SET UP WORKSPACE---------------------------------------------- #
# Create a list of files to download based on input type of files above
if files.endswith(('.txt', '.csv')):
    # Input is a text/csv file with one URL per line. Lines keep their trailing
    # newline here; the download code strips each URL before requesting it.
    # The `with` block closes the file handle, which the bare open() never did.
    with open(files, 'r') as url_file:
        fileList = url_file.readlines()
elif isinstance(files, str):
    fileList = [files]                       # If input is a single file

# Generalize download directory: make sure it ends with a path separator
if saveDir[-1] != '/' and saveDir[-1] != '\\':
    saveDir = saveDir.strip("'").strip('"') + os.sep

In [89]:
urs = 'urs.earthdata.nasa.gov'    # Address to call for authentication
netrcDir = os.path.expanduser("~/.netrc")


def _append_earthdata_login(netrc_path, host):
    """Prompt for Earthdata Login credentials and append them to ``netrc_path``.

    The entry is written with ordinary file I/O instead of `echo ... >>` shell
    commands, so the password is never interpolated into a shell command line
    and the write has finished when this function returns. If the file does
    not exist yet it is created with mode 0o600 (owner read/write only); the
    permissions of an existing file are left untouched.
    """
    username = getpass(prompt='Enter NASA Earthdata Login Username \n(or create an account at urs.earthdata.nasa.gov): ')
    password = getpass(prompt='Enter NASA Earthdata Login Password: ')
    fd = os.open(netrc_path, os.O_WRONLY | os.O_APPEND | os.O_CREAT, 0o600)
    with os.fdopen(fd, 'a') as netrc_file:
        # The leading newline protects against an existing file whose last line
        # is not newline-terminated.
        netrc_file.write(f"\nmachine {host}\nlogin {username}\npassword {password}\n")


# --------------------------------AUTHENTICATION CONFIGURATION----------------------------------- #
# Determine if netrc file exists, and if so, if it includes NASA Earthdata Login Credentials
try:
    netrc(netrcDir).authenticators(urs)[0]
except (FileNotFoundError, TypeError):
    # FileNotFoundError: there is no ~/.netrc yet.
    # TypeError: the file exists but authenticators() returned None, i.e. it has
    # no entry for the Earthdata Login host.
    # In both cases ask for the credentials and append an entry.
    _append_earthdata_login(netrcDir, urs)

# Check (for up to 1 minute) that the credentials can now be read back
tries = 0
while tries < 30:
    try:
        netrc(netrcDir).authenticators(urs)[2]
        break  # credentials are readable, no need to keep polling
    except Exception:
        time.sleep(2.0)
    tries += 1

In [93]:
from pathlib import Path

In [295]:
tasks = inventory.query('cache=="no"')

In [296]:
fileList = tasks.file_location[0:10000].tolist()

In [108]:
import warnings
warnings.filterwarnings("ignore")

In [297]:
fileList

['https://daac.ornl.gov/daacdata/gedi/GEDI_L3_LandSurface_Metrics_V2/comp/GEDI_L3_LandSurface_Metrics_V2.pdf']

In [275]:
# Download the first file of the task list into the local cache.
f = fileList[0].strip()

# Mirror the remote layout below local_cache_root/<product>/.
# maxsplit=1 splits only at the product directory, so a file name that repeats
# the product name (e.g. ".../<product>/comp/<product>.pdf") stays intact.
saveName = local_cache_root+product+f.split(product, 1)[1]
Path(saveName).parent.mkdir(parents=True, exist_ok=True)

# Read the Earthdata Login entry once: index 0 is the login, index 2 the password
creds = netrc(netrcDir).authenticators(urs)

# Create and submit request and download file
with requests.get(f, verify=True, stream=True, auth=(creds[0], creds[2]), timeout=60) as response:
    if response.status_code != 200:
        print(response.status_code)
        print("{} not downloaded. Verify that your username and password are correct in {}".format(f.split('/')[-1].strip(), netrcDir))
    else:
        response.raw.decode_content = True
        content = response.raw
        # Write to a temporary ".part" file and rename it when complete, so an
        # interrupted download never leaves a truncated file that the inventory
        # check (os.path.isfile) would count as cached.
        partName = saveName + '.part'
        with open(partName, 'wb') as d:
            while True:
                chunk = content.read(16 * 1024)
                if not chunk:
                    break
                d.write(chunk)
        os.replace(partName, saveName)

401
GEDI_ATBD_L4A_v1.0.pdf not downloaded. Verify that your username and password are correct in /home/liuz/.netrc


In [278]:
#!/usr/bin/python
# Alternative download path using only the standard library (urllib) with an
# HTTP basic-auth handler and a cookie jar for Earthdata Login.
from http.cookiejar import CookieJar
from urllib.error import HTTPError, URLError
from urllib.parse import urlencode

import urllib.request as urllib

# The user credentials that will be used to authenticate access to the data

username = netrc(netrcDir).authenticators(urs)[0]
password = netrc(netrcDir).authenticators(urs)[2]

# The url of the file we wish to retrieve is `f` (set in the download cell
# above), the same file that `saveName` was derived from.

#url = "http://e4ftl01.cr.usgs.gov/MOLA/MYD17A3H.006/2009.01.01/MYD17A3H.A2009001.h12v05.006.2015198130546.hdf.xml"

# Create a password manager to deal with the 401 response that is returned from
# Earthdata Login

password_manager = urllib.HTTPPasswordMgrWithDefaultRealm()
password_manager.add_password(None, "https://urs.earthdata.nasa.gov", username, password)

# Create a cookie jar for storing cookies. This is used to store and return
# the session cookie given to use by the data server (otherwise it will just
# keep sending us back to Earthdata Login to authenticate).  Ideally, we
# should use a file based cookie jar to preserve cookies between runs. This
# will make it much more efficient.

cookie_jar = CookieJar()


# Install all the handlers.

opener = urllib.build_opener(
    urllib.HTTPBasicAuthHandler(password_manager),
    #urllib2.HTTPHandler(debuglevel=1),    # Uncomment these two lines to see
    #urllib2.HTTPSHandler(debuglevel=1),   # details of the requests/responses
    urllib.HTTPCookieProcessor(cookie_jar))
urllib.install_opener(opener)

# Create and submit the request for the file itself. The original code passed
# the global `url`, which elsewhere in this notebook holds a directory-listing
# URL rather than the file described by `f` / `saveName`.

request = urllib.Request(f.strip())

# urlopen raises HTTPError for error statuses and URLError for connection
# problems; both are reported instead of aborting the cell.
try:
    with urllib.urlopen(request) as response:
        if response.getcode() != 200:
            print("{} not downloaded. Verify that your username and password are correct in {}".format(f.split('/')[-1].strip(), netrcDir))
        else:
            content = response.read()
            with open(saveName, 'wb') as d:
                d.write(content)
except (HTTPError, URLError) as err:
    print(err)
    print("{} not downloaded. Verify that your username and password are correct in {}".format(f.split('/')[-1].strip(), netrcDir))

In [None]:
## migrated to python script for parallelization
# -----------------------------------------DOWNLOAD FILE(S)-------------------------------------- #
# Loop through and download all files to the local cache, keeping the remote
# directory layout and file names below local_cache_root/<product>/.

# Read the Earthdata Login entry once instead of re-parsing ~/.netrc twice per
# file: index 0 is the login, index 2 the password.
creds = netrc(netrcDir).authenticators(urs)

for f in tqdm.tqdm(fileList):
    f = f.strip()
    # maxsplit=1 splits only at the product directory, so a file name that
    # repeats the product name stays intact.
    saveName = local_cache_root+product+f.split(product, 1)[1]
    # parents=True: nested date/sub-directories may not exist yet
    Path(saveName).parent.mkdir(parents=True, exist_ok=True)

    # Create and submit request and download file.
    # verify=True keeps TLS certificate checking on: the request carries the
    # Earthdata Login credentials, so the server identity must be validated.
    with requests.get(f, verify=True, stream=True, auth=(creds[0], creds[2]), timeout=60) as response:
        if response.status_code != 200:
            print("{} not downloaded. Verify that your username and password are correct in {}".format(f.split('/')[-1].strip(), netrcDir))
        else:
            response.raw.decode_content = True
            content = response.raw
            # Write to a temporary ".part" file and rename it when complete, so
            # an interrupted download never leaves a truncated file that the
            # inventory check (os.path.isfile) would count as cached.
            partName = saveName + '.part'
            with open(partName, 'wb') as d:
                while True:
                    chunk = content.read(16 * 1024)
                    if not chunk:
                        break
                    d.write(chunk)
            os.replace(partName, saveName)
            #print('Downloaded file: {}'.format(saveName))
    #time.sleep(1.0)

In [353]:
import datetime

In [355]:
print(datetime.datetime.now())

2023-04-26 16:16:48.606545
