# Collection of geospatial data and STAC - GEDI 

In [None]:
import os
from datetime import datetime

import pandas as pd
import requests
import tqdm
from bs4 import BeautifulSoup

## Manage inventory

### Initialize inventory
GEDI data come in four levels, served through web interfaces on two sites:

Level 1 and Level 2 : https://e4ftl01.cr.usgs.gov/GEDI/

Level 3 and Level 4 : https://daac.ornl.gov/daacdata/gedi/

In [None]:
# Build the base URL of the chosen Level 1 / Level 2 product directory.
avail_products = [
    "GEDI01_B.002",
    "GEDI02_A.002",
    "GEDI02_B.002",
]
product = avail_products[2]  # change the index to inventory another product
url = "https://e4ftl01.cr.usgs.gov/GEDI/" + product + "/"

In [None]:
# Build the base URL of the chosen Level 3 / Level 4 product directory.
l3l4_products = [
    "GEDI_L3_LandSurface_Metrics_V2",
    "GEDI_L4A_AGB_Density_V2_1",
    "GEDI_L4A_AGB_Density_GW",
    "GEDI_L4B_Gridded_Biomass",
]
product = l3l4_products[2]  # change the index to inventory another product
url = "https://daac.ornl.gov/daacdata/gedi/" + product + "/"

In [None]:
# Collect the sublevel directory URLs listed at `url` into next_level_links.
# Only anchors that come after the "Parent Directory" link are kept; anything
# listed before it is skipped.
response = requests.get(url)
next_level_links = []
valid = False
if response.status_code != 200:
    print("Failed to retrieve directory listing")
else:
    soup = BeautifulSoup(response.content, "html.parser")
    # every <a> element that carries an href attribute
    for anchor in soup.find_all("a", href=True):
        if valid:
            next_level_links.append(url + anchor["href"])
        elif anchor.get_text() == "Parent Directory":
            valid = True

In [None]:
# Display the collected sublevel directory URLs.
next_level_links

In [None]:
# Recursively collect the download links of individual files.
# Every file URL found by get_file_dict is also appended to this list.
direct_download_links = []


def get_file_dict(url):
    """Crawl the directory listing at ``url`` and map link text to file URL.

    Anchors are only considered once the "Parent Directory" link has been
    seen. An href ending in "/" is treated as a subdirectory and crawled
    recursively, with its entries merged into the result. Any other href is
    recorded as ``url + href`` under the anchor's text and also appended to
    the module-level ``direct_download_links`` list.

    Returns an empty dict (after printing a message) when the request does
    not come back with HTTP status 200.
    """
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to retrieve directory listing for {url}")
        return {}

    file_dict = {}
    past_parent_link = False
    listing = BeautifulSoup(response.content, "html.parser")
    for anchor in listing.find_all("a", href=True):
        if anchor.get_text() == "Parent Directory":
            past_parent_link = True
            continue
        if not past_parent_link:
            continue

        target = url + anchor["href"]
        if anchor["href"].endswith("/"):
            # subdirectory: descend and merge whatever it contains
            file_dict.update(get_file_dict(target))
        else:
            direct_download_links.append(target)
            file_dict[anchor.text] = target
    return file_dict

In [None]:
# Crawl every sublevel directory; get_file_dict fills direct_download_links
# as a side effect, so the returned dict is only kept for inspection.
for sublevel_url in tqdm.tqdm(next_level_links):
    day_file_dict = get_file_dict(sublevel_url)
    # print(len(day_file_dict))

In [None]:
# Store the download links as the initial inventory database (a CSV table).
# The table is used to track download status when a download is interrupted.
#
# `datetime` is the class imported with `from datetime import datetime`, so
# the current time comes from `datetime.now()`; `datetime.datetime.now()`
# raises AttributeError with that import.
today_s = datetime.now().date().strftime("%m-%d-%Y")
init_inventory = pd.DataFrame(
    {"file_location": direct_download_links, "cache": "no", "last_check": ""}
)
# Make sure the target directory exists before writing the table.
os.makedirs("../../data/gedi", exist_ok=True)
init_inventory.to_csv(f"../../data/gedi/inventory_{product}_latest.csv", index=False)

### Data collection in background

There are two separate scripts, one for L1/L2 and one for L3/L4, because their authentication processes are different.

- Usage

To download Level 1B, 2A, or 2B products, run `pyScripts/daac_pool.py -p [1B|2A|2B]`.

To download Level 3 or Level 4 products, run `pyScripts/daac_session.py -p [3|4A|4A_GW|4B]`.

- Note

The script uses multi-processing. Change `num_threads = 16` to use a different number of cores.
An authentication file is needed at `~/.netrc`, in the format shown in the block below.
The script only downloads files that are marked as `no` in the inventory table `/data/gedi/inventory_[collection_short_name]_latest.csv`.

<pre><code>
machine urs.earthdata.nasa.gov
login USERNAME
password PASSWORD
</code></pre>

### Inventory check

Available collection names:

"GEDI01_B.002",
"GEDI02_A.002",
"GEDI02_B.002",
"GEDI_L3_LandSurface_Metrics_V2",
"GEDI_L4A_AGB_Density_V2_1",
"GEDI_L4A_AGB_Density_GW_2028",
"GEDI_L4B_Gridded_Biomass"

In [None]:
# Pick the collection to check and load its inventory table.
product = "GEDI01_B.002"
# Prefix that check_file prepends when looking for a downloaded file.
local_cache_root = "../../../daac_data_download_python/data/"
inventory = pd.read_csv("../../data/gedi/inventory_" + product + "_latest.csv")

In [None]:
# ALERT!! for level 1 and level 2
def check_file(x, product):
    """Return "yes" if the file behind download link ``x`` is cached, else "no".

    The local path is ``local_cache_root`` + ``product`` + the piece of ``x``
    that follows the first occurrence of ``product``.
    """
    cache_prefix = local_cache_root + product
    remainder = x.split(product)[1]
    if os.path.isfile(cache_prefix + remainder):
        return "yes"
    return "no"

In [None]:
# ALERT!! for level 3 and level 4
def check_file(x, product):
    """Return "yes" if the file behind download link ``x`` is cached, else "no".

    The local path is ``local_cache_root`` + ``product`` + the piece of ``x``
    that follows the first occurrence of ``"gedi/" + product``.
    """
    cache_prefix = local_cache_root + product
    remainder = x.split("gedi/" + product)[1]
    if os.path.isfile(cache_prefix + remainder):
        return "yes"
    return "no"

In [None]:
# Recompute the cache flag of every inventory row, then stamp today's date on
# the rows whose file is present locally.
inventory["cache"] = inventory.file_location.map(lambda x: check_file(x, product))
# `datetime` is the class imported with `from datetime import datetime`, so
# the call is `datetime.now()`; `datetime.datetime.now()` raises AttributeError.
today_s = datetime.now().date().strftime("%m-%d-%Y")
inventory.loc[inventory.cache == "yes", "last_check"] = today_s

In [None]:
# Show how many files are cached ("yes") and how many are still to do ("no").
inventory.cache.value_counts()

In [None]:
# Update the database: write the refreshed table to the inventory CSV of `product`.
inventory.to_csv(f"../../data/gedi/inventory_{product}_latest.csv", index=False)

## STAC information
Reference document: https://cmr.earthdata.nasa.gov/search/site/docs/search/api.html#stac

### List collections

In [None]:
# CMR collection search for every short_name matching the pattern "GEDI*".
gedi_product_query = "https://cmr.earthdata.nasa.gov/search/collections?short_name=GEDI*&options[short_name][pattern]=true"

In [None]:
# Run the collection search and decode the JSON reply.
cmr_response = requests.get(
    gedi_product_query,
    headers={"Accept": "application/json"},
).json()

In [None]:
# One row per collection entry in the CMR reply: id, short_name and title.
collection_metadata = pd.DataFrame(
    [
        {
            "id": entry["id"],
            "short_name": entry["short_name"],
            "title": entry["title"],
        }
        for entry in cmr_response["feed"]["entry"]
    ],
    columns=["id", "short_name", "title"],
)

In [None]:
# Display the table of collection ids, short names and titles.
collection_metadata

In [None]:
# chatgpt prompt input:
#   Write python code in jupyter cell: given a list of product `ids`,
#   for each `id` in the list, make a directory under `data/gedi/`
#   with the name of `id` and save the return json from
#   the url `https://cmr.earthdata.nasa.gov/search/concepts/{id}.stac`
import json

# List of product ids
ids = collection_metadata.id.tolist()

base_directory = "../../data/gedi/stac/"

# Create the base directory if it doesn't exist
os.makedirs(base_directory, exist_ok=True)

# The loop variable is named `concept_id` rather than `id` so the builtin
# `id()` is not overwritten for the rest of the session. The request names
# are cell-specific so the `url` / `response` globals used by the
# directory-listing cells are left alone.
for concept_id in ids:
    directory_path = os.path.join(base_directory, str(concept_id))
    os.makedirs(directory_path, exist_ok=True)

    stac_url = f"https://cmr.earthdata.nasa.gov/search/concepts/{concept_id}.stac"
    stac_response = requests.get(stac_url)

    if stac_response.status_code == 200:
        json_data = stac_response.json()
        file_path = os.path.join(directory_path, f"{concept_id}.json")

        with open(file_path, "w") as file:
            json.dump(json_data, file, indent=4)

        print(f"JSON data saved for id: {concept_id}")
    else:
        print(f"Failed to retrieve JSON data for id: {concept_id}")

### List items
Inspired by the [GEDI finder code](https://git.earthdata.nasa.gov/projects/LPDUR/repos/gedi-finder-tutorial-python/browse/GEDI_Finder.py),
this section iteratively collects the item JSON.

In [None]:
# Lookup table from collection short_name to its CMR concept id.
concept_ids = {
    short_name: concept_id
    for short_name, concept_id in zip(
        collection_metadata.short_name,
        collection_metadata.id,
    )
}

In [None]:
# Display the short_name -> concept id mapping.
concept_ids

In [None]:
# Choose the collection short name to query and show its concept id.
product = "GEDI_L4B_Gridded_Biomass_2017"
concept_ids[product]

In [None]:
# Base CMR granule STAC search url with the max page size (2000 is the max allowed)
cmr = "https://cmr.earthdata.nasa.gov/search/granules.stac?pretty=true&page_size=2000&collection_concept_id="
# CMR uses pagination for queries with more features returned than the page size
page = 1
# Start from an empty list so that a failed first request cannot leave an
# older `cmr_response` (the collection-search reply) in place of the items.
cmr_response = []

try:
    while True:
        # Send GET request to the CMR granule search endpoint with the product
        # concept ID and page number, and take the "features" of the json reply
        page_features = requests.get(
            f"{cmr}{concept_ids[product]}&pageNum={page}"
        ).json()["features"]
        cmr_response += page_features
        print("fetched page", page)

        # Decide from the size of THIS page, not the running total: the old
        # test `len(cmr_response) % 2000 == 0` stayed true for 0 results and
        # for totals that are an exact multiple of 2000, so it never stopped.
        if len(page_features) < 2000:
            break
        page += 1
except (requests.RequestException, ValueError, KeyError):
    # If the request did not complete successfully, print out the response from CMR
    print(requests.get(f"{cmr}{concept_ids[product]}&pageNum={page}").json())

In [None]:
# Write every fetched item to its own JSON file, named after the item's id,
# inside the items/ folder of the selected collection.
item_dir = f"../../data/gedi/stac/{concept_ids[product]}/items/"
os.makedirs(item_dir, exist_ok=True)

for stac_item in tqdm.tqdm(cmr_response):
    file_path = os.path.join(item_dir, f"{stac_item['id']}.json")
    with open(file_path, "w") as handle:
        json.dump(stac_item, handle, indent=4)