In [None]:
!pip install contextily~=1.3.0
!pip install geopandas~=0.13.2
!pip install folium~=0.14.0
!pip install h5py~=3.8.0
!pip install harmony-py~=0.4.9
!pip install IPython~=8.12.2
!pip install matplotlib~=3.7.1
!pip install matplotlib_scalebar~=0.8.1
!pip install netCDF4~=1.6.4
!pip install numpy~=1.24.3
!pip install owslib~=0.29.2
!pip install pandas~=2.0.2
!pip install pydap~=3.4.1
!pip install rasterio~=1.3.7
!pip install rasterstats~=0.19.0
!pip install requests~=2.31.0
!pip install s3fs~=2023.6.0
!pip install seaborn~=0.12.2
!pip install scipy~=1.10.1
!pip install shapely~=2.0.1
!pip install tabulate~=0.9.0
!pip install mlxtend
!pip install plotly
!pip install scikit-learn

In [2]:
# import modules
%matplotlib inline
import os
import traceback
from os import path
from glob import glob
import requests
import datetime as dt
import pandas as pd
import geopandas as gpd
import contextily as ctx
import netCDF4 as nc
from matplotlib import pyplot as plt
from pydap.cas.urs import setup_session
from shapely.ops import orient
from IPython import display
from requests.adapters import HTTPAdapter, Retry
import seaborn as sns
# Use seaborn's white-grid theme for plots.
sns.set(style='whitegrid')
# Turn off pandas' chained-assignment (SettingWithCopy) check for the whole notebook.
pd.set_option('mode.chained_assignment', None)
import warnings
# NOTE(review): this silences every Python warning for the rest of the session,
# including deprecation notices from the libraries imported above.
warnings.filterwarnings('ignore')

#### Set `input_shape_file_path` to the folder that contains the `.shp` file
#### Set `input_shape_file_name` to the file name of the `.shp` file

In [3]:
# Shapefile describing the regions of interest: file name and containing folder.
input_shape_file_name = "NC_forests_all_time.shp"
input_shape_file_path = "/kaggle/input/nc-forests-all-time"

# Create the folder when it is not there yet (nothing happens if it already exists).
if not os.path.isdir(input_shape_file_path):
    os.makedirs(input_shape_file_path, exist_ok=True)

#### Set `year_` to the year you are interested in (we downloaded data for the years 2018 to 2021)
#### Set `outdir_folder` to the folder where the **GEDI L4A** data should be downloaded

In [4]:
# Root download folder and the year whose GEDI data is requested.
outdir_folder = "/kaggle/working/BulkDownload"
year_ = 2021

# Per-year output folder: <outdir_folder>/agbd_output/<year_>; created if missing.
outdir_output = os.path.join(outdir_folder, f"agbd_output/{year_}")
os.makedirs(outdir_output, exist_ok=True)

#### Add your Earthdata **username** and **password**

In [5]:
# Earthdata URS credentials used for the OPeNDAP login session.
# Get an account here: https://urs.earthdata.nasa.gov/
#
# The credentials are read from the EARTHDATA_USERNAME / EARTHDATA_PASSWORD
# environment variables so that they do not have to be saved inside the notebook.
# When a variable is not set, the placeholder string is used instead; replace the
# placeholder with your own value only in a copy of the notebook you do not share.

username = os.environ.get("EARTHDATA_USERNAME", "MyEarthDataUsername")
password = os.environ.get("EARTHDATA_PASSWORD", "MyEarthDataPassword")

In [6]:
# Variables of interest, named by their path below each beam group.
variables = [
    'agbd',
    'l4_quality_flag',
    'degrade_flag',
    'agbd_se',
    'land_cover_data/pft_class',
    'solar_elevation',
]

# Beams of interest: group names are 'BEAM' followed by a four-digit code.
beams = [
    'BEAM' + code
    for code in ('0000', '0001', '0010', '0011', '0101', '0110', '1000', '1011')
]

In [7]:
# Column order of the output CSV: the four per-shot fields first,
# then every entry of `variables` in its original order.
headers = ['lat_lowestmode', 'lon_lowestmode', 'elev_lowestmode', 'shot_number'] + list(variables)

In [8]:
def search_all_data(aca, index_, year=None):
    """Search NASA CMR for granules that intersect a region and collect their OPeNDAP URLs.

    The collection is looked up by DOI, then granules are searched page by page
    using the region's geometry (uploaded as GeoJSON) and a one-year time window.

    Parameters
    ----------
    aca : object with a ``geometry.to_json()`` method (a GeoDataFrame in this notebook)
        Region(s) used as the spatial filter of the granule search.
    index_ : hashable
        Identifier of the region; used only as the key of the returned dict.
    year : int, optional
        Calendar year to search. Defaults to the notebook-level ``year_`` setting.

    Returns
    -------
    dict
        ``{index_: [opendap_url, ...]}`` with one URL per link whose title
        starts with ``'OPeNDAP'``.

    Raises
    ------
    requests.HTTPError
        If CMR answers with an HTTP error status.
    ValueError
        If no collection is registered for the DOI.
    """
    if year is None:
        year = year_  # fall back to the notebook-level configuration

    doi = '10.3334/ORNLDAAC/2056'
    # CMR API base url
    cmrurl = 'https://cmr.earthdata.nasa.gov/search/'
    # Seconds to wait for CMR before giving up instead of blocking the worker forever.
    request_timeout = 300

    doisearch = f"{cmrurl}collections.json?doi={doi}"
    response = requests.get(doisearch, timeout=request_timeout)
    response.raise_for_status()
    collections = response.json()['feed']['entry']
    if not collections:
        raise ValueError(f"No CMR collection found for DOI {doi}")
    concept_id = collections[0]['id']

    # The region is sent to CMR as an uploaded GeoJSON "shapefile".
    geojson = {"shapefile": ("aca.geojson", aca.geometry.to_json(), "application/geo+json")}
    page_size = 2000  # CMR page size limit. do not change this. it is maxed out

    # Cover the whole year: the end bound is the last second of December 31,
    # otherwise granules acquired during the final day would be left out.
    start_date = dt.datetime(year, 1, 1)
    end_date = dt.datetime(year, 12, 31, 23, 59, 59)

    # CMR formatted start and end times
    dt_format = '%Y-%m-%dT%H:%M:%SZ'
    temporal_str = start_date.strftime(dt_format) + ',' + end_date.strftime(dt_format)

    granulesearch = f"{cmrurl}granules.json"
    opendap_arr = []
    page_num = 1

    while True:
        cmr_param = {
            "collection_concept_id": concept_id,
            "temporal": temporal_str,
            "page_size": page_size,
            "page_num": page_num,
            "simplify-shapefile": 'true'  # this is needed to bypass 5000 coordinates limit of CMR
        }

        response = requests.post(granulesearch, data=cmr_param, files=geojson,
                                 timeout=request_timeout)
        response.raise_for_status()
        granules = response.json()['feed']['entry']

        for g in granules:
            # Keep only the OPeNDAP links of each granule
            for link in g.get('links', []):
                if link.get('title', '').startswith('OPeNDAP'):
                    opendap_arr.append(link['href'])

        # A page that is not full is the last one; no need to request another.
        if len(granules) < page_size:
            break
        page_num += 1

    total_granules = len(opendap_arr)
    print ("Total granules found: ", total_granules)

    return {index_: opendap_arr}

In [9]:
def _open_hyrax_dataset(session, hyrax_url):
    """Fetch `hyrax_url` and open the response body as an in-memory netCDF dataset.

    Returns None when the server answers 400 (the request is skipped, as before).
    Any other HTTP error status raises instead of being handed to the netCDF parser.
    The caller is responsible for closing the returned dataset.
    """
    # (connect, read) timeouts in seconds so a stalled connection cannot hang the worker.
    r = session.get(hyrax_url, timeout=(30, 300))
    if r.status_code == 400:
        return None
    r.raise_for_status()
    return nc.Dataset('hyrax', memory=r.content)


def download_data(opendap_map, aca):
    """Download the shots that fall inside one region and write them to a CSV file.

    For every granule URL and every beam in `beams`, the shot coordinates are
    fetched first; the shots located within the region are kept, and the
    remaining columns of `headers` are then requested only for those shots.

    Parameters
    ----------
    opendap_map : dict
        Single-entry mapping ``{index_: [opendap_url, ...]}`` as returned by
        `search_all_data`.
    aca : GeoDataFrame
        Frame whose geometry at label ``index_`` is the region of interest.

    Returns
    -------
    None
        Output is written to ``<outdir_output>/<index_>.csv`` (overwritten on each call).

    Raises
    ------
    requests.HTTPError
        If an OPeNDAP request ends with an HTTP error status other than 400.
    """
    session = setup_session(username, password, check_url="https://opendap.earthdata.nasa.gov/")
    try:
        index_ = list(opendap_map.keys())[0]
        opendap_arr = opendap_map[index_]
        csv_file_name = "{}.csv".format(index_)
        out_csv = os.path.join(outdir_output, csv_file_name)

        # Start the CSV with the header row only; data rows are appended per beam.
        with open(out_csv, "w") as f:
            f.write(','.join(headers) + '\n')

        retries = Retry(total=50, backoff_factor=0.1, status_forcelist=[ 500, 502, 503, 504 ])
        session.mount('https://', HTTPAdapter(max_retries=retries))

        # The region and the list of extra columns do not change inside the loops.
        region = aca.geometry[index_]
        extra_columns = headers[2:]  # everything except lat/lon, which come from step 1

        for g_name in opendap_arr:
            # loop over all beams
            for beam in beams:
                # 1. Retrieving lat, lon coordinates for the file
                hyrax_url = f"{g_name}.dap.nc4?dap4.ce=/{beam}/lon_lowestmode;/{beam}/lat_lowestmode"
                ds = _open_hyrax_dataset(session, hyrax_url)
                if ds is None:
                    continue
                try:
                    lat = ds[beam]['lat_lowestmode'][:]
                    lon = ds[beam]['lon_lowestmode'][:]
                finally:
                    ds.close()
                df = pd.DataFrame({'lat_lowestmode': lat, 'lon_lowestmode': lon})

                # 2. Subsetting by bounds of the area of interest
                gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.lon_lowestmode, df.lat_lowestmode))
                # Explicit copy: columns are added to this subset below.
                gdf_aca = gdf[gdf['geometry'].within(region)].copy()
                if gdf_aca.empty:
                    continue

                # creating empty columns for variables
                for v in extra_columns:
                    gdf_aca[v] = None

                # 3. Retrieving the remaining columns, only for the shots within the region.
                # Consecutive shot indices are grouped so that each run is one range request.
                for _, df_gr in gdf_aca.groupby((gdf_aca.index.to_series().diff() > 1).cumsum()):
                    i = df_gr.index.min()
                    j = df_gr.index.max()
                    for v in extra_columns:
                        # %5B / %5D are the URL-encoded '[' and ']' of the index range
                        var_s = f"/{beam}/{v}%5B{i}:{j}%5D"
                        hyrax_url = f"{g_name}.dap.nc4?dap4.ce={var_s}"
                        ds = _open_hyrax_dataset(session, hyrax_url)
                        if ds is None:
                            continue
                        try:
                            gdf_aca.loc[i:j, (v)] = ds[beam][v][:]
                        finally:
                            ds.close()

                # saving the output file
                gdf_aca.to_csv(out_csv, mode='a', index=False, header=False, columns=headers)
    finally:
        # Release the pooled connections even when a request fails.
        session.close()


In [10]:
def search_and_download(shape_file_data_frame):
    """Search and download the data of one region; never raises.

    Parameters
    ----------
    shape_file_data_frame : tuple
        ``(frame, index_)`` where ``frame`` holds the region's geometry and
        ``index_`` identifies the region.

    Returns
    -------
    int
        0 when the region was processed, 1 when any error occurred (the error
        is printed together with its traceback).
    """
    # Bound before the try block so the error message below can always be
    # formatted, even when unpacking the task itself fails.
    index_ = None
    try:
        df, index_ = shape_file_data_frame[0], shape_file_data_frame[1]
        new_df = df.copy()
        new_df.crs = "EPSG:4326"
        new_df.geometry = new_df.geometry.apply(orient, args=(1,))
        opendap_map = search_all_data(new_df, index_)
        download_data(opendap_map, new_df)
        print("Finished downloading {}".format(index_))
        return 0
    except Exception:
        # there will definitely be errors while downloading from opendap servers. Ignore them
        print("Error in downloading {}".format(index_))
        traceback.print_exc()
        return 1

In [11]:
# Read the regions from the .shp file. Essentially we want to download L4A data
# for every region in the shapefile.
all_data = gpd.read_file(os.path.join(input_shape_file_path, input_shape_file_name))

# The regions are later compared with points built from longitude/latitude, so
# they must be expressed in EPSG:4326. A file without CRS information is assumed
# to already be in EPSG:4326; a file with a CRS is reprojected rather than just
# relabelled, which would leave the coordinates in the wrong units.
if all_data.crs is None:
    all_data = all_data.set_crs("EPSG:4326")
else:
    all_data = all_data.to_crs("EPSG:4326")

# Give every polygon the same ring orientation (sign 1.0 in shapely's `orient`).
all_data.geometry = all_data.geometry.apply(orient, args=(1,))

#### Set `no_of_prc` to the number of processes used to download data from OPeNDAP
#### Every patch/region from the shapefile will be a separate process/download request to the OPeNDAP server
#### **Note - This download will take a while (days), since data has to be downloaded for every patch. We therefore split the task across multiple notebooks/user accounts, each downloading a different set of patches by changing the `start` and `end` variables**

In [None]:
from multiprocessing import Pool

# Size of the worker pool.
no_of_prc = 20

# Half-open range [start, end) of shapefile rows handled by this notebook.
start = 0
end = len(all_data)

# One task per row: (copy of the single-row frame, row position).
matrix = [(all_data.iloc[pos:pos + 1].copy(), pos) for pos in range(start, end)]

In [None]:
# Run one search-and-download task per region in a pool of worker processes.
if __name__ == '__main__':
    with Pool(no_of_prc) as p:
        statuses = p.map(search_and_download, matrix)

    # search_and_download returns 0 on success and 1 on failure; list the
    # regions that failed so they can be re-run instead of being lost silently.
    failed = [idx for (_, idx), status in zip(matrix, statuses) if status != 0]
    print("Downloaded {} of {} regions".format(len(matrix) - len(failed), len(matrix)))
    if failed:
        print("Regions that failed and should be re-run: {}".format(failed))