# imports of functions

In [1]:
# author: Philipp Arndt, UC San Diego / Scripps Institution of Oceanography
# 
# intended for use on OSG OSPool, called in run_py.sh, which is called in a submit file 
# submit file is based on a granule list queried locally in make_granule_list.ipynb 
# see examples for submit files in: HTCondor_submit/ 
# see examples for granule lists in:  granule_lists/
# 
# run locally with: 
# $ conda activate icelakes-env
# $ python3 detect_lakes.py --granule <granule_producer_id> --polygon geojsons/<polygon_name.geojson>
# 
# a call that returns a bunch of lakes
# $ python3 detect_lakes.py --granule ATL03_20220714010847_03381603_006_02.h5 --polygon geojsons/simplified_GRE_2500_CW.geojson

import argparse
import os
import gc
import sys
import pickle
import subprocess
import traceback
import numpy as np
import icelakes
from icelakes.utilities import encedc, decedc, get_size
from icelakes.nsidc import download_granule, edc
from icelakes.detection import read_atl03, detect_lakes, melt_lake

# Command-line interface of the detection script. Every option is a plain
# string; each row below is (flag, default value, help text).
_CLI_OPTIONS = (
    ('--granule', 'ATL03_20220714010847_03381603_006_02.h5',
     'The producer_id of the input ATL03 granule'),
    ('--polygon', 'geojsons/simplified_GRE_2000_CW.geojson',
     'The file path of a geojson file for spatial subsetting'),  # geojsons/west_greenland.geojson
    ('--is2_data_dir', 'IS2data',
     'The directory into which to download ICESat-2 granules'),
    ('--download_gtxs', 'all',
     'String value or list of gtx names to download, also accepts "all"'),
    ('--out_data_dir', 'detection_out_data',
     'The directory to which to write the output data'),
    ('--out_plot_dir', 'detection_out_plot',
     'The directory to which to write the output plots'),
    ('--out_stat_dir', 'detection_out_stat',
     'The directory to which to write the granule stats'),
)

parser = argparse.ArgumentParser(description='Test script to print some stats for a given ICESat-2 ATL03 granule.')
for _flag, _default, _help in _CLI_OPTIONS:
    parser.add_argument(_flag, type=str, default=_default, help=_help)

# There is no real command line inside the notebook, so an empty argument list
# is parsed and every option takes its default value.
args = parser.parse_args([])

In [2]:
### default, this works
# args.granule = 'ATL03_20220714010847_03381603_006_02.h5'
# args.polygon = 'geojsons/simplified_GRE_2000_CW.geojson'

###  this one works too
# args.granule = 'ATL03_20200225170832_09310610_006_01.h5'
# args.polygon = 'geojsons/simplified_ANT_1500_East_D-Dp.geojson'

# ### 501 response (request URI too long???)
# args.granule = 'ATL03_20230206123048_07321811_006_01.h5'
# args.polygon = 'geojsons/simplified_ANT_1000_East_E-Ep.geojson'

### 501 response (No granules returned by CMR: 
### The polygon boundary points are listed in the wrong order. Points must be provided in counter-clockwise order.)
# args.granule = 'ATL03_20210208202116_07151010_006_01.h5'
# args.polygon = 'geojsons/simplified_ANT_1000_West_Ep-F.geojson'

# now this clips the shape weirdly
# args.granule = 'ATL03_20211111233437_07711312_006_01.h5' 
# args.polygon = 'geojsons/simplified_ANT_1000_East_E-Ep.geojson'

# args.granule = 'ATL03_20211202063100_10811310_006_01.h5'
# args.polygon = 'geojsons/simplified_ANT_1000_West_Ep-F.geojson'

# args.granule = 'ATL03_20221117060934_08781710_006_01.h5' 
# args.polygon = 'geojsons/simplified_ANT_1000_East_Jpp-K.geojson'

# args.granule = 'ATL03_20200803105046_05970803_006_01.h5'
# args.polygon = 'geojsons/simplified_GRE_2000_SE.geojson'

# Active selection: the only uncommented pair in this cell. It overrides the
# argparse defaults for every cell that follows.
args.granule = 'ATL03_20190311211215_11210211_006_02.h5'
args.polygon = 'geojsons/simplified_ANT_1000_East_Jpp-K.geojson'

In [3]:
# Echo the namespace to confirm which granule / polygon are in effect.
args

Namespace(granule='ATL03_20190311211215_11210211_006_02.h5', polygon='geojsons/simplified_ANT_1000_East_Jpp-K.geojson', is2_data_dir='IS2data', download_gtxs='all', out_data_dir='detection_out_data', out_plot_dir='detection_out_plot', out_stat_dir='detection_out_stat')

In [4]:
# Try to figure out where the script is being executed (just to show those maps
# at conferences, etc.). This is strictly best-effort: any failure falls back to
# the placeholder location '0.0,0.0' so the rest of the run is unaffected.
def parse_compute_latlon(raw_stdout):
    """Extract the location string from the wrapper script's output.

    Parameters
    ----------
    raw_stdout : bytes or str
        Captured standard output of the location wrapper script.

    Returns
    -------
    str or None
        The text enclosed between the '<x><y><z>' and '<z><y><x>' markers, or
        None when either marker is missing (previously a missing marker
        produced a meaningless slice of the output).
    """
    opening, closing = '<x><y><z>', '<z><y><x>'
    if isinstance(raw_stdout, bytes):
        text = raw_stdout.decode('utf-8', errors='replace')
    else:
        text = str(raw_stdout)
    start = text.find(opening)
    if start < 0:
        return None
    start += len(opening)
    stop = text.find(closing, start)
    if stop < 0:
        return None
    return text[start:stop]


try:
    with open('location-wrapper.sh', 'rb') as file: script = file.read()
    # the script contents are handed to the shell as the command to run
    geoip_out = subprocess.run(script, shell=True, capture_output=True)
    compute_latlon = parse_compute_latlon(geoip_out.stdout)
    if compute_latlon is None:
        raise ValueError('location markers not found in wrapper output')
    print('\nThis job is running at the following lat/lon location:%s\n' % compute_latlon)
except Exception:
    # Exception (not a bare except) so Ctrl-C / SystemExit still propagate
    compute_latlon='0.0,0.0'
    print('\nUnable to determine compute location for this script.\n')

# # shuffling files around for HTCondor
# for thispath in (args.is2_data_dir, args.out_data_dir, args.out_plot_dir):
#     if not os.path.exists(thispath): os.makedirs(thispath)


This job is running at the following lat/lon location:32.8807000,-117.2359000



In [5]:
# download the specified ICESat-2 data from NSIDC
# The call returns the path of the downloaded file (or the string 'none', see
# the checks further down) and the status code of the request, which later
# cells compare against 200.
# NOTE(review): decedc(edc().u) / decedc(edc().p) look like the decoded NSIDC
# user name and password — confirm against icelakes.nsidc / icelakes.utilities.
input_filename, request_status_code = download_granule(args.granule, args.download_gtxs, args.polygon, args.is2_data_dir, 
                                            decedc(edc().u), decedc(edc().p), vars_sub='default', spatial_sub=True)
#input_filename = 'IS2data/processed_ATL03_20200225170832_09310610_006_01.h5'
# input_filename = 'IS2data/processed_ATL03_20220714010847_03381603_006_02.h5'
# request_status_code = 200

--> parameters: granule_id = ATL03_20190311211215_11210211_006_02.h5
                gtxs = all
                geojson = geojsons/simplified_ANT_1000_East_Jpp-K.geojson
                granule_output_path = IS2data
                vars_sub = default
                spatial_sub = True


Downloading ICESat-2 data. Found granules:
  ATL03_20190311211215_11210211_006_02.h5, 1800.03 MB

Input geojson: geojsons/simplified_ANT_1000_East_Jpp-K.geojson
Simplified polygon coordinates based on geojson input: -49.55840928505104,-78.08002335541525,-50.82357776191365,-78.67622732607624,-50.69829536339721,-79.49569076709194,-53.858482681598204,-80.06357802036612,-54.930903242573834,-80.57292704632596,-54.82163272698139,-81.2533275662497,-59.577399080744044,-82.56871373720666,-65.67446611192638,-84.70735865518276,-63.06639098022328,-84.95852210141601,-56.389500887584,-84.58592510339423,-53.99472040990648,-83.60188391978124,-54.370559794091214,-83.1509981725118,-52.66795209678896,-83.07236510255494,-5

In [None]:
# import geopandas as gpd
# poly_nonsimplified = args.polygon.replace('simplified_', '')
# clip_shape = gpd.read_file(poly_nonsimplified)

# # first just slice to the bounding box
# lons = clip_shape.loc[0].geometry.exterior.coords.xy[0]
# lats = clip_shape.loc[0].geometry.exterior.coords.xy[1]
# print('%.5f,%.5f,%.5f,%.5f' % (np.min(lons), np.min(lats), np.max(lons), np.max(lats)))

In [6]:
%matplotlib widget
import geopandas as gpd
import pandas as pd
import h5py
import matplotlib.pyplot as plt
# input_filename = 'IS2data/processed_ATL03_20200225170832_09310610_006_01.h5'
# Interactive map with the simplified polygon, the non-simplified clipping
# polygon and a thinned-out ground track for each of the three right-hand beams.
gdf_poly = gpd.read_file(args.polygon)
poly_nonsimplified = args.polygon.replace('simplified_', '')
clip_shape = gpd.read_file(poly_nonsimplified)
dfs = [gdf_poly, clip_shape]
# dfs=[]

# Open the granule once and read all beams from the same handle (the file used
# to be reopened for every beam).
with h5py.File(input_filename, 'r') as f:
    for gtx in ['gt1r', 'gt2r', 'gt3r']:
        lon_ph = f[gtx]['heights']['lon_ph'][:]
        lat_ph = f[gtx]['heights']['lat_ph'][:]
        df = pd.DataFrame({'lat_ph': lat_ph, 'lon': lon_ph})
        # thin the track: round latitude to 2 decimals and keep the median
        # longitude of each latitude bin
        df['lat'] = np.round(df.lat_ph,2)
        df_gt = df.groupby('lat')[['lon']].median().reset_index()
        gdf_gt = gpd.GeoDataFrame(geometry=gpd.points_from_xy(df_gt.lon, df_gt.lat), crs="EPSG:4326")
        dfs.append(gdf_gt)

gdf = pd.concat(dfs)
gdf.explore()

In [8]:
# Sanity checks on the NSIDC API response before any processing is attempted.
# Exit code 127: the request failed. Exit code 69: nothing usable was returned.
MIN_GRANULE_BYTES = 31457280  # 30 MB

status_ok = request_status_code==200
print('Request status code:', request_status_code, status_ok)

# guard: the request itself failed
if not status_ok:
    print('NSIDC API request failed.')
    sys.exit(127)

# from here on the request is known to have succeeded
with open('success.txt', 'w') as f:
    print('we got some sweet data', file=f)

# guard: no file was produced, or the file on disk is too small to hold data
if input_filename == 'none' or (os.path.exists(input_filename)
                                and os.path.getsize(input_filename) < MIN_GRANULE_BYTES):
    print('granule seems to be empty. nothing more to do here.') 
    sys.exit(69)

Request status code: 200 True


In [9]:
# File size in MB. NOTE: this is not the last expression of the cell, so the
# value is computed but never displayed.
os.path.getsize(input_filename)/1024**2
import geopandas as gpd
import pandas as pd
import h5py
import matplotlib.pyplot as plt

# Count, for every beam, how many photons remain after clipping the downloaded
# data to the non-simplified polygon.
poly_nonsimplified = args.polygon.replace('simplified_', '')
clip_shape = gpd.read_file(poly_nonsimplified)

with h5py.File(input_filename, 'r') as f:
    for gtx in ['gt1r', 'gt2r', 'gt3r', 'gt1l', 'gt2l', 'gt3l']:
        heights = f[gtx]['heights']
        lon_ph = heights['lon_ph'][:]
        lat_ph = heights['lat_ph'][:]
        h_ph = heights['h_ph'][:]
        df = pd.DataFrame({'lat': lat_ph, 'lon': lon_ph, 'h': h_ph})
        photon_data = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.lon, df.lat), crs="EPSG:4326")

        # optional pre-filter (disabled): first just slice to the bounding box
        # lons = clip_shape.loc[0].geometry.exterior.coords.xy[0]
        # lats = clip_shape.loc[0].geometry.exterior.coords.xy[1]
        # photon_data = photon_data[(photon_data.lon > np.min(lons)) & (photon_data.lon < np.max(lons)) &
        #                           (photon_data.lat > np.min(lats)) & (photon_data.lat < np.max(lats))]

        # exact clip to the polygon, then drop the geometry column again
        clipped = gpd.clip(photon_data, clip_shape)
        photon_data = clipped.reset_index(drop=True)
        df = pd.DataFrame(photon_data.drop(columns='geometry'))
        print(len(df))


491095
473132
487388
1986318
1569913
1896897


In [10]:
# Read only the granule-level information: with gtxs_to_read='none' the recorded
# output shows that no beam is read.
# NOTE(review): this call unpacks two return values while a later cell unpacks
# three (gtx_list, ancillary, photon_data) — confirm the return arity of
# icelakes.detection.read_atl03 for each gtxs_to_read setting.
gtx_list, ancillary = read_atl03(input_filename, gtxs_to_read='none')

  reading in IS2data/processed_ATL03_20190311211215_11210211_006_02.h5
  reading in beam:  --> done.


In [11]:
# Run the lake detection beam by beam, accumulating the detected lakes and the
# four per-granule statistics (length total, length lakes, photons total,
# photons lakes).
lake_list = []
granule_stats = [0,0,0,0]
for gtx in gtx_list:
    lakes_found, gtx_stats = detect_lakes(input_filename, gtx, args.polygon, verbose=False)
    # element-wise sum of this beam's stats into the granule totals
    for i in range(len(granule_stats)):
        granule_stats[i] = granule_stats[i] + gtx_stats[i]
    lake_list.extend(lakes_found)
    # release the per-beam results before moving on to the next beam
    del lakes_found
    del gtx_stats
    gc.collect()

  reading in IS2data/processed_ATL03_20190311211215_11210211_006_02.h5
  reading in beam: gt1l  --> done.

-----------------------------------------------------------------------------

PROCESSING GROUND TRACK: gt1l (strong)
---> finding flat surfaces in photon data (159 / 1003 were flat)
---> removing afterpulses, calculating photon densities & looking for second density peaks below the surface
(0 / 1003 pass lake quality test)
---> merging major frame segments that possibly represent lakes iteratively
   ---> nothing to merge.
  reading in IS2data/processed_ATL03_20190311211215_11210211_006_02.h5
  reading in beam: gt1r  --> done.

-----------------------------------------------------------------------------

PROCESSING GROUND TRACK: gt1r (weak)
---> finding flat surfaces in photon data (155 / 1003 were flat)
---> removing afterpulses, calculating photon densities & looking for second density peaks below the surface
(0 / 1003 pass lake quality test)
---> merging major frame segments 

In [12]:
# Totals over all beams: [length total, length lakes, photons total, photons lakes]
granule_stats

[848116.2062980905, 0.0, 6862994, 0]

In [13]:
# A non-zero total track length means that photon data was actually read.
if granule_stats[0] > 0:
    with open('success.txt', 'w') as f: print('we got some data from NSIDC!!', file=f)
    print('Sucessfully retrieved data from NSIDC!!')
    
# print stats for granule
print('\nGRANULE STATS (length total, length lakes, photons total, photons lakes):%.3f,%.3f,%i,%i\n' % tuple(granule_stats))

# Make sure the output directories exist before anything is written to them
# (a no-op when the surrounding job wrapper has already created them).
for out_dir in (args.out_data_dir, args.out_plot_dir, args.out_stat_dir):
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

# for each lake call the surrf algorithm for depth determination
# if it fails, just skip the lake, but print traceback for the logs 
print('---> determining depth for each lake')
for i, lake in enumerate(lake_list):
    try: 
        lake.surrf()
        print('   --> %3i/%3i, %s | %8.3fN, %8.3fE: %6.2fm deep / quality: %8.2f' % (i+1, len(lake_list), lake.gtx, lake.lat, 
                                                                                 lake.lon, lake.max_depth, lake.lake_quality))
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate
        print('Error for lake %i (detection quality = %.5f) ... skipping:' % (i+1, lake.detection_quality))
        traceback.print_exc()
        lake.lake_quality = 0.0

# remove zero quality lakes
# lake_list[:] = [lake for lake in lake_list if lake.lake_quality > 0]

# for each lake 
for i, lake in enumerate(lake_list):
    lake.lake_id = '%s_%s_%s_%04i' % (lake.polygon_name, lake.granule_id[:-3], lake.gtx, i)
    # the leading number is 10 * (1000 - quality), floored at zero, so that an
    # alphabetical listing of the files starts with the highest-quality lakes
    filename_base = 'lake_%05i_%s_%s_%s' % (np.clip(1000-lake.lake_quality,0,None)*10, 
                                                       lake.ice_sheet, lake.melt_season, 
                                                       lake.lake_id)
    # plot each lake and save to image
    fig = lake.plot_lake(closefig=True)
    figname = args.out_plot_dir + '/%s.jpg' % filename_base
    if fig is not None: fig.savefig(figname, dpi=300, bbox_inches='tight', pad_inches=0)
    
    # export each lake to h5 and pickle
    try:
        h5name = args.out_data_dir + '/%s.h5' % filename_base
        datafile = lake.write_to_hdf5(h5name)
        print('Wrote data file: %s, %s' % (datafile, get_size(datafile)))
    except Exception:
        print('Could not write hdf5 file <%s>' % lake.lake_id)
        # keep the reason in the logs, consistent with the surrf error handling
        traceback.print_exc()
        # try:
        #     pklname = args.out_data_dir + '/%s.pkl' % filename_base
        #     with open(pklname, 'wb') as f: pickle.dump(vars(lake), f)
        #     print('Wrote data file: %s, %s' % (pklname, get_size(pklname)))
        # except:
        #     print('Could not write pickle file.')

# NOTE(review): args.granule[:-4] removes four characters although '.h5' has
# only three (lake.granule_id[:-3] is used above), so the last character of the
# granule id is dropped from the stats file name. Left unchanged because other
# tooling may rely on the existing names — confirm before fixing.
statsfname = args.out_stat_dir + '/stats_%s_%s.csv' % (args.polygon[args.polygon.rfind('/')+1:].replace('.geojson',''), args.granule[:-4])
with open(statsfname, 'w') as f: print('%.3f,%.3f,%i,%i,%s' % tuple(granule_stats+[compute_latlon]), file=f)

# clean up the input data
#########os.remove(input_filename)

print('\n-------------------------------------------------')
print(  '----------->   Python script done!   <-----------')
print(  '-------------------------------------------------\n')

Sucessfully retrieved data from NSIDC!!

GRANULE STATS (length total, length lakes, photons total, photons lakes):848116.206,0.000,6862994,0

---> determining depth for each lake

-------------------------------------------------
----------->   Python script done!   <-----------
-------------------------------------------------



In [None]:
%matplotlib widget
import geopandas as gpd
import pandas as pd
import h5py
import matplotlib.pyplot as plt
# Quick look at the photon cloud of the first detected lake.
# NOTE(review): lake_list[0] is indexed unconditionally; in the recorded run
# above no lake was reported, in which case this raises IndexError.
# The first line is a bare expression that is not the last one in the cell, so
# it displays nothing.
lake_list[0].photon_data
fig, ax = plt.subplots(figsize=[6,3])
dfl = lake_list[0].photon_data
ax.scatter(dfl.xatc, dfl.h, s=1, c='k', alpha=1)

In [None]:
# Display the photon-level dataframe of the first lake (requires at least one lake).
lake_list[0].photon_data

In [None]:
from psutil import Process
# Memory experiment on a copy of the first lake's photon dataframe: report the
# dataframe size and the process RSS before and after down-casting columns.
# The columns are replaced one by one on purpose; the statement order is part
# of what is being measured.
df = lake_list[0].photon_data.copy()
start_mem = df.memory_usage().sum() / 1024
print('Memory usage of dataframe is {:.2f} KB'.format(start_mem))
print('Total: {:.2f} MB'.format(Process().memory_info().rss/1024**2))

# df.xatc-=df.xatc.min()
# df['xatc'] = df.xatc.astype(np.float32)
# NOTE(review): float16 keeps only about three significant decimal digits —
# confirm that this precision is sufficient for the geoid values.
df['geoid'] = df.geoid.astype(np.float16)
df['h'] = df.h.astype(np.float32)
# df['dt'] = df.dt.astype(np.int32)
# the remaining columns are stored sparsely, with the given fill value left implicit
df['snr'] = df.snr.astype(pd.SparseDtype(np.float32, fill_value=0))
df['is_afterpulse'] = df.is_afterpulse.astype(pd.SparseDtype(np.bool_, fill_value=False))
df['prob_afterpulse'] = df.prob_afterpulse.astype(pd.SparseDtype(np.float16, fill_value=0))
df['sat_ratio'] = df.sat_ratio.astype(pd.SparseDtype(np.float16, fill_value=0))
df['sat_ratio_smooth'] = df.sat_ratio_smooth.astype(pd.SparseDtype(np.float16, fill_value=0))
df['sat_elev'] = df.sat_elev.astype(pd.SparseDtype(np.float32, fill_value=np.nan))

end_mem = df.memory_usage().sum() / 1024
print('Memory usage of dataframe is {:.2f} KB'.format(end_mem))
print('Total: {:.2f} MB'.format(Process().memory_info().rss/1024**2))

In [None]:
# Scratch check: a boolean sparse array whose values all equal the fill value.
pd.Series(pd.arrays.SparseArray([False]*10, fill_value=False))

In [None]:
# Scratch check: build a sparse float16 column of zeros, then assign a value to
# the row labelled 3.
bla = pd.DataFrame(pd.arrays.SparseArray(np.zeros(10).astype(np.float16), fill_value=0))
bla.loc[3] = 1.9032093

In [None]:
# Inputs for stepping through the detection of a single beam by hand.
gtx = 'gt2l'
polygon = args.polygon
verbose = False
print(input_filename, gtx, polygon)

In [None]:
# Read the selected beam, including the geoid information.
# NOTE(review): three values are unpacked here, but only two in the earlier
# call with gtxs_to_read='none' — confirm the return arity of read_atl03.
gtx_list, ancillary, photon_data = read_atl03(input_filename, geoid_h=True, gtxs_to_read=gtx)

In [None]:
from psutil import Process
# Memory experiment on a copy of one beam's photon dataframe: report dataframe
# size and process RSS before and after down-casting four columns.
df = photon_data[gtx].copy()
start_mem = df.memory_usage().sum() / 1024**2
print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
print('Total: {:.2f} MB'.format(Process().memory_info().rss/1024**2))

# shift xatc so that it starts at zero before it is cast to float32
df.xatc-=df.xatc.min()
df['xatc'] = df.xatc.astype(np.float32)
df['geoid'] = df.geoid.astype(np.float32)
df['h'] = df.h.astype(np.float32)
# NOTE(review): the cast to int32 discards any fractional part of dt — confirm
# that this is intended.
df['dt'] = df.dt.astype(np.int32)

end_mem = df.memory_usage().sum() / 1024**2
print('Memory usage of dataframe is {:.2f} MB'.format(end_mem))
print('Total: {:.2f} MB'.format(Process().memory_info().rss/1024**2))

In [None]:
# Inspect the dt column after casting it to int32.
df.dt.astype(np.int32)

In [None]:
# Inspect xatc relative to its minimum (the memory cell above already shifted
# df.xatc in place).
df.xatc - df.xatc.min()

In [None]:
# Per-column memory footprint of the down-cast dataframe.
df.memory_usage()

In [None]:
# Same names as the parameters of read_atl03(filename, geoid_h, gtxs_to_read),
# so that the function body can be stepped through at cell level.
# `filename` keeps the value input_filename has right now; later reassignments
# of input_filename do not change it.
filename = input_filename
geoid_h=True
gtxs_to_read='gt3l'

In [None]:
%matplotlib widget
import geopandas as gpd
import pandas as pd
import h5py
import matplotlib.pyplot as plt
# input_filename = 'IS2data/processed_ATL03_20200225170832_09310610_006_01.h5'
# Interactive map with the simplified polygon, the non-simplified clipping
# polygon and a thinned-out ground track for each of the three right-hand beams.
# (Same analysis as the map cell near the top of the notebook.)
gdf_poly = gpd.read_file(args.polygon)
poly_nonsimplified = args.polygon.replace('simplified_', '')
clip_shape = gpd.read_file(poly_nonsimplified)
dfs = [gdf_poly, clip_shape]
# dfs=[]

# Open the granule once and read all beams from the same handle (the file used
# to be reopened for every beam).
with h5py.File(input_filename, 'r') as f:
    for gtx in ['gt1r', 'gt2r', 'gt3r']:
        lon_ph = f[gtx]['heights']['lon_ph'][:]
        lat_ph = f[gtx]['heights']['lat_ph'][:]
        df = pd.DataFrame({'lat_ph': lat_ph, 'lon': lon_ph})
        # thin the track: round latitude to 2 decimals and keep the median
        # longitude of each latitude bin
        df['lat'] = np.round(df.lat_ph,2)
        df_gt = df.groupby('lat')[['lon']].median().reset_index()
        gdf_gt = gpd.GeoDataFrame(geometry=gpd.points_from_xy(df_gt.lon, df_gt.lat), crs="EPSG:4326")
        dfs.append(gdf_gt)

gdf = pd.concat(dfs)
gdf.explore()

In [None]:
# Plot the stored ph_index_beg against segment_id for one beam.
# NOTE(review): input_filename is rebound to a hard-coded path here, so every
# later cell that uses input_filename reads this file instead of the granule
# downloaded above.
input_filename = 'IS2data/processed_ATL03_20200225170832_09310610_006_01.h5'
gtx = 'gt2l'
with h5py.File(input_filename, 'r') as f:
    segment_id = f[gtx]['geolocation']['segment_id'][:]
    ph_index_beg = f[gtx]['geolocation']['ph_index_beg'][:]
fig, ax = plt.subplots(figsize=[6,3])
ax.scatter(segment_id, ph_index_beg, s=1, c='k', alpha=0.1)
ax.set_xlabel('%s/geolocation/segment_id' % gtx)
ax.set_ylabel('%s/geolocation/ph_index_beg' % gtx)
fig.tight_layout()  

In [None]:
%matplotlib widget
import geopandas as gpd
import pandas as pd
import h5py
import matplotlib.pyplot as plt
# Compare the segment start indices stored in the file with indices rebuilt
# from the per-segment photon counts.
beam = 'gt2l'
gtx = beam

with h5py.File(input_filename, 'r') as f:
    segment_id = f[gtx]['geolocation']['segment_id'][:]
    ph_index_beg = f[gtx]['geolocation']['ph_index_beg'][:]
    segment_ph_cnt = f[gtx]['geolocation']['segment_ph_cnt'][:]
    segment_dist_x = f[gtx]['geolocation']['segment_dist_x'][:]
    # dist_ph_along and x_atc are two separate reads of the same dataset
    dist_ph_along = f[gtx]['heights']['dist_ph_along'][:]
    pce_mframe_cnt = f[gtx]['heights']['pce_mframe_cnt'][:]
    x_atc = f[gtx]['heights']['dist_ph_along'][:]
    dt = f[gtx]['heights']['delta_time'][:]
    h = f[gtx]['heights']['h_ph'][:]

# the scatter uses the indices exactly as stored in the file ...
fig, ax = plt.subplots(figsize=[9,5])
ax.scatter(segment_id, ph_index_beg, s=1, c='k', alpha=0.1)

# ... and from here on ph_index_beg is replaced by 0-based start indices
# rebuilt as the running sum of the photon counts of all preceding segments
ph_index_beg = np.concatenate(([0], np.cumsum(segment_ph_cnt[:-1])))

print(ph_index_beg)
print(segment_ph_cnt)
# difference between the summed segment counts and the number of photons read
print(np.sum(segment_ph_cnt) - len(dt))
print(len(dt))
print(np.sum(segment_ph_cnt))
print(len(segment_id))
print(pce_mframe_cnt)

In [None]:
# Plot x_atc against photon index and mark each segment start with a red line.
fig, ax = plt.subplots(figsize=[9,5])
# the first xmin/xmax pair is overwritten immediately by the second pair
xmin = 181907.259119056
xmax = 192586.55220003167
xmin = 0.259119056
xmax = 192586.55220003167
# NOTE: xmin/xmax are applied to photon *indices* (idxs, ph_index_beg) below,
# not to along-track distances
idxs = np.array(range(len(x_atc)))
idx_beg = ph_index_beg[(ph_index_beg > xmin) & (ph_index_beg < xmax)]
sel = (idxs > xmin) & (idxs < xmax)
ax.scatter(idxs[sel], x_atc[sel], s=1, c='k', alpha=0.5)
# ax.scatter(segment_id, ph_index_beg_, s=1, c='k', alpha=0.5)
for idx in idx_beg:
    ax.plot([idx]*2, [0,20], 'r-', lw=0.5)

In [None]:
# Plot the start indices exactly as stored in the file against segment_id.
# ph_index_beg_ is read here because, in notebook order, it is otherwise first
# defined in a later cell and this cell would fail on a fresh kernel.
with h5py.File(input_filename, 'r') as f:
    ph_index_beg_ = f[gtx]['geolocation']['ph_index_beg'][:]
fig, ax = plt.subplots(figsize=[9,5])
ax.scatter(segment_id, ph_index_beg_, s=1, c='k', alpha=0.1)

In [None]:

# Same plot as before, with 1 subtracted from the stored indices.
# NOTE(review): this cell reads `filename`, whereas neighbouring cells read
# `input_filename`; after input_filename was rebound to a hard-coded path the
# two may refer to different granules.
# NOTE(review): if the stored dtype were unsigned, subtracting 1 from a 0 entry
# would wrap around — confirm the dtype of ph_index_beg.
fig, ax = plt.subplots(figsize=[6,3])
with h5py.File(filename, 'r') as f:
    segment_id = f[gtx]['geolocation']['segment_id'][:]
    ph_index_beg = f[gtx]['geolocation']['ph_index_beg'][:] - 1
ax.scatter(segment_id, ph_index_beg, s=1, c='k', alpha=0.1)
ax.set_xlabel('%s/geolocation/segment_id' % gtx)
ax.set_ylabel('%s/geolocation/ph_index_beg' % gtx)
fig.tight_layout()  

In [None]:
import h5py
# Read the segment-level variables of one beam from a hard-coded granule and
# plot segment_dist_x against segment_id.
# NOTE(review): rebinds input_filename to a fixed path (hidden state for any
# cell executed afterwards).
input_filename = 'IS2data/processed_ATL03_20200225170832_09310610_006_01.h5'
gtx = 'gt2l'
fig, ax = plt.subplots(figsize=[6,3])
with h5py.File(input_filename, 'r') as f:
    segment_id = f[gtx]['geolocation']['segment_id'][:]
    ph_index_beg = f[gtx]['geolocation']['ph_index_beg'][:] - 1
    segment_ph_cnt = f[gtx]['geolocation']['segment_ph_cnt'][:]
    segment_dist_x = f[gtx]['geolocation']['segment_dist_x'][:]
    dist_ph_along = f[gtx]['heights']['dist_ph_along'][:]

# ax.scatter(segment_id, ph_index_beg, s=1, c='k', alpha=0.1)

# start indices rebuilt from the photon counts of all preceding segments
idxs = np.concatenate(([0], np.cumsum(segment_ph_cnt[:-1])))
# ax.scatter(segment_id, idxs, s=1, c='k', alpha=0.1)
# ax.scatter(segment_id, ph_index_beg-idxs, s=1, c='k', alpha=0.1)
ax.scatter(segment_id, segment_dist_x, s=1, c='k', alpha=0.1)

In [None]:
# Difference between the stored start indices (minus 1) and the indices rebuilt
# from the cumulative photon counts, per segment.
fig, ax = plt.subplots(figsize=[6,3])
ax.scatter(segment_id, ph_index_beg-idxs, s=1, c='k', alpha=0.1)

In [None]:
# Cell-level walk-through of reading one beam of an ATL03 file. It appears to
# mirror the body of read_atl03, using the same variable names (filename,
# gtxs_to_read).
beam = 'gt2r'
gtx = beam
gtxs_to_read=beam

print('  reading in', filename)
# the part of the path from 'ATL03_' up to and including '.h5'
granule_id = filename[filename.find('ATL03_'):(filename.find('.h5')+3)]

# open file
# NOTE(review): opened without a context manager; the handle is closed only by
# the f.close() at the very end of this cell.
f = h5py.File(filename, 'r')

# make dictionaries for beam data to be stored in
dfs = {}
dfs_bckgrd = {}
all_beams = ['gt1l', 'gt1r', 'gt2l', 'gt2r', 'gt3l', 'gt3r']
# a beam counts as available when its heights group is present in the file
beams_available = [beam for beam in all_beams if "/%s/heights/" % beam in f]

# resolve the request ('all', 'none', one name or a list of names) to the list
# of beams that are both requested and available
if gtxs_to_read=='all':
    beamlist = beams_available
elif gtxs_to_read=='none':
    beamlist = []
else:
    if type(gtxs_to_read)==list: beamlist = list(set(gtxs_to_read).intersection(set(beams_available)))
    elif type(gtxs_to_read)==str: beamlist = list(set([gtxs_to_read]).intersection(set(beams_available)))
    else: beamlist = beams_available

# only referenced by a commented-out line further down in this cell
conf_landice = 3 # index for the land ice confidence

# numeric spacecraft orientation flag, translated to text by orient_string below
orient = f['orbit_info']['sc_orient'][0]
def orient_string(sc_orient):
    """Translate the numeric spacecraft orientation flag into a text label.

    Parameters
    ----------
    sc_orient : number
        Orientation flag as stored in orbit_info/sc_orient.

    Returns
    -------
    str
        'backward' for 0, 'forward' for 1, 'transition' for 2 and 'error' for
        any other value.
    """
    known_orientations = ((0, 'backward'), (1, 'forward'), (2, 'transition'))
    for code, label in known_orientations:
        if sc_orient == code:
            return label
    return 'error'
    
orient_str = orient_string(orient)
gtl = ['gt1l', 'gt1r', 'gt2l', 'gt2r', 'gt3l', 'gt3r']
# beam numbers 1..6: odd numbers are labelled 'strong', even numbers 'weak'
beam_strength_dict = {k:['weak','strong'][k%2] for k in np.arange(1,7,1)}
# the mapping from ground track name to beam number depends on the orientation:
# gt1l..gt3r map to beams 6..1 when 'forward' and to beams 1..6 when 'backward'
if orient_str == 'forward':
    bl = np.arange(6,0,-1)
    gtx_beam_dict = {k:v for (k,v) in zip(gtl,bl)}
    gtx_strength_dict = {k:beam_strength_dict[gtx_beam_dict[k]] for k in gtl}
elif orient_str == 'backward':
    bl = np.arange(1,7,1)
    gtx_beam_dict = {k:v for (k,v) in zip(gtl,bl)}
    gtx_strength_dict = {k:beam_strength_dict[gtx_beam_dict[k]] for k in gtl}
else:
    gtx_beam_dict = {k:'undefined' for k in gtl}
    gtx_strength_dict = {k:'undefined' for k in gtl}

# Everything below reads from the open file handle `f`; the try/finally makes
# sure the handle is closed even when building `ancillary` fails or the cell is
# interrupted.
try:
    ancillary = {'granule_id': granule_id,
                 'atlas_sdp_gps_epoch': f['ancillary_data']['atlas_sdp_gps_epoch'][0],
                 'rgt': f['orbit_info']['rgt'][0],
                 'cycle_number': f['orbit_info']['cycle_number'][0],
                 'sc_orient': orient_str,
                 'gtx_beam_dict': gtx_beam_dict,
                 'gtx_strength_dict': gtx_strength_dict,
                 'gtx_dead_time_dict': {}}

    # loop through all beams
    print('  reading in beam:', end=' ')
    for beam in beamlist:

        print(beam, end=' ')
        try:

            # mean dead time: first 16 entries for strong beams, the rest otherwise
            if gtx_strength_dict[beam]=='strong':
                ancillary['gtx_dead_time_dict'][beam] = np.mean(np.array(f['ancillary_data']['calibrations']['dead_time'][beam]['dead_time'])[:16])
            else:
                ancillary['gtx_dead_time_dict'][beam] = np.mean(np.array(f['ancillary_data']['calibrations']['dead_time'][beam]['dead_time'])[16:])

            #### get photon-level data
            # if "/%s/heights/" not in f: break; # 

            df = pd.DataFrame({'lat': np.array(f[beam]['heights']['lat_ph']),
                               'lon': np.array(f[beam]['heights']['lon_ph']),
                               'h': np.array(f[beam]['heights']['h_ph']),
                               'dt': np.array(f[beam]['heights']['delta_time']),
                               # 'conf': np.array(f[beam]['heights']['signal_conf_ph'][:,conf_landice]),
                               # not using ATL03 confidences here
                               'mframe': np.array(f[beam]['heights']['pce_mframe_cnt']),
                               'ph_id_pulse': np.array(f[beam]['heights']['ph_id_pulse']),
                               'qual': np.array(f[beam]['heights']['quality_ph'])}) 
            #### calculate along-track distances [meters from the equator crossing] from segment-level data
            segment_id = f[beam]['geolocation']['segment_id'][:]
            n_seg = len(segment_id)
            segment_ph_cnt = f[beam]['geolocation']['segment_ph_cnt'][:]
            # start index of each segment, rebuilt from the photon counts of all
            # preceding segments rather than taken from the stored ph_index_beg
            ph_index_beg = np.concatenate(([0], np.cumsum(segment_ph_cnt[:-1])))
            segment_dist_x = f[beam]['geolocation']['segment_dist_x'][:]
            x_atc = f[beam]['heights']['dist_ph_along'][:]
            # for each 20m segment
            for j,_ in enumerate(segment_id):
                idx = ph_index_beg[j]
                cnt = segment_ph_cnt[j]
                # skip segments with no photon events
                if (cnt == 0):
                    continue
                # add segment distance to along-track coordinates
                x_atc[idx:idx+cnt] += segment_dist_x[j]
            df['xatc'] = x_atc
            # ph_index_beg = np.int64(f[beam]['geolocation']['ph_index_beg']) - 1
            # segment_dist_x = np.array(f[beam]['geolocation']['segment_dist_x'])
            # segment_length = np.array(f[beam]['geolocation']['segment_length'])
            # valid = ph_index_beg>=0 # need to delete values where there's no photons in the segment (-1 value)
            # df.loc[ph_index_beg[valid], 'xatc'] = segment_dist_x[valid]
            # df.xatc.fillna(method='ffill',inplace=True)
            # df.xatc += np.array(f[beam]['heights']['dist_ph_along'])

            #### now we can filter out TEP (we don't do IRF / afterpulses because it seems to not be very good...)
            df.query('qual < 3',inplace=True) 
            # df.drop(columns=['qual'], inplace=True)

            #### sort by along-track distance (for interpolation to work smoothly)
            df.sort_values(by='xatc',inplace=True)
            df.reset_index(inplace=True, drop=True)

            # if geoid_h:
            #     #### interpolate geoid to photon level using along-track distance, and add to elevation
            #     geophys_geoid = np.array(f[beam]['geophys_corr']['geoid'])
            #     geophys_geoid_x = segment_dist_x+0.5*segment_length
            #     valid_geoid = geophys_geoid<1e10 # filter out INVALID_R4B fill values
            #     geophys_geoid = geophys_geoid[valid_geoid]
            #     geophys_geoid_x = geophys_geoid_x[valid_geoid]
            #     # hacky fix for no weird stuff happening if geoid is undefined everywhere
            #     if len(geophys_geoid>5):
            #         geoid = np.interp(np.array(df.xatc), geophys_geoid_x, geophys_geoid)
            #         df['h'] = df.h - geoid
            #         df['geoid'] = geoid
            #         del geoid
            #     else:
            #         df['geoid'] = 0.0

            #### save to list of dataframes
            dfs[beam] = df
            del df 
            gc.collect()
            #Mdfs_bckgrd[beam] = df_bckgrd

        except Exception:
            # a failing beam is skipped; Exception (not a bare except) lets
            # Ctrl-C / SystemExit propagate instead of moving on to the next beam
            print('Error for {f:s} on {b:s} ... skipping:'.format(f=filename, b=beam))
            traceback.print_exc()
finally:
    f.close()

In [None]:
# Same consistency check as for gt2l, now for beam gt1r read from `filename`.
# The stored start indices are kept under the separate name ph_index_beg_.
beam = 'gt1r'
gtx = beam

with h5py.File(filename, 'r') as f:
    segment_id = f[gtx]['geolocation']['segment_id'][:]
    ph_index_beg_ = f[gtx]['geolocation']['ph_index_beg'][:]
    segment_ph_cnt = f[gtx]['geolocation']['segment_ph_cnt'][:]
    segment_dist_x = f[gtx]['geolocation']['segment_dist_x'][:]
    # dist_ph_along and x_atc are two separate reads of the same dataset
    dist_ph_along = f[gtx]['heights']['dist_ph_along'][:]
    pce_mframe_cnt = f[gtx]['heights']['pce_mframe_cnt'][:]
    x_atc = f[gtx]['heights']['dist_ph_along'][:]
    dt = f[gtx]['heights']['delta_time'][:]
    h = f[gtx]['heights']['h_ph'][:]

# 0-based start indices rebuilt from the photon counts of all preceding segments
ph_index_beg = np.concatenate(([0], np.cumsum(segment_ph_cnt[:-1])))

print(ph_index_beg)
print(ph_index_beg_)
print(segment_ph_cnt)
# difference between the summed segment counts and the number of photons read
print(np.sum(segment_ph_cnt) - len(dt))
print(len(dt))
print(np.sum(segment_ph_cnt))
print(len(segment_id))
print(pce_mframe_cnt)

In [None]:
# Plot x_atc against photon index and mark each segment start with a red line
# (same code as the earlier segment-start plot, run on the arrays loaded above).
fig, ax = plt.subplots(figsize=[9,5])
# the first xmin/xmax pair is overwritten immediately by the second pair
xmin = 181907.259119056
xmax = 192586.55220003167
xmin = 0.259119056
xmax = 192586.55220003167
# NOTE: xmin/xmax are applied to photon *indices* (idxs, ph_index_beg) below
idxs = np.array(range(len(x_atc)))
idx_beg = ph_index_beg[(ph_index_beg > xmin) & (ph_index_beg < xmax)]
sel = (idxs > xmin) & (idxs < xmax)
ax.scatter(idxs[sel], x_atc[sel], s=1, c='k', alpha=0.5)
# ax.scatter(segment_id, ph_index_beg_, s=1, c='k', alpha=0.5)
for idx in idx_beg:
    ax.plot([idx]*2, [0,20], 'r-', lw=0.5)

In [None]:
# Photon height against x_atc as read from dist_ph_along, i.e. without the
# per-segment distance added.
fig, ax = plt.subplots(figsize=[9,5])
ax.scatter(x_atc, h, s=1, c='k', alpha=0.1)

In [None]:
# Photon height against delta_time for the same beam.
fig, ax = plt.subplots(figsize=[9,5])
ax.scatter(dt, h, s=1, c='k', alpha=0.1)

In [None]:
# Same segment-start plot restricted to a narrower window.
# NOTE: xmin/xmax are applied to photon *indices* (idxs, ph_index_beg) here.
fig, ax = plt.subplots(figsize=[9,5])
xmin = 181907.259119056
xmax = 192586.55220003167
idxs = np.array(range(len(x_atc)))
idx_beg = ph_index_beg[(ph_index_beg > xmin) & (ph_index_beg < xmax)]
sel = (idxs > xmin) & (idxs < xmax)
ax.scatter(idxs[sel], x_atc[sel], s=1, c='k', alpha=0.1)
for idx in idx_beg:
    ax.plot([idx]*2, [0,20], 'r-', lw=0.5)

In [None]:
# Inspect the rebuilt segment start indices.
ph_index_beg

In [None]:
# Number of photons read for the beam.
len(x_atc)

In [None]:
# Number of photons whose x_atc value exceeds xmin.
# NOTE(review): the plotting cells apply xmin/xmax to photon indices, here they
# are compared with x_atc values — confirm which quantity was intended.
np.sum(x_atc > xmin)

In [None]:
# Number of photons whose x_atc value is below xmax (see the note on xmin/xmax
# being used as index bounds in the plotting cells).
np.sum(x_atc < xmax)

In [None]:
# Height against along-track distance from the dataframe stored in dfs for the
# current value of `beam`; dfs must be the per-beam dict built by the reader cell.
fig, ax = plt.subplots(figsize=[9,5])
ax.scatter(dfs[beam].xatc, dfs[beam].h, s=1, c='k', alpha=0.1)

In [None]:
# Height against delta time from the dataframe stored in dfs for `beam`.
fig, ax = plt.subplots(figsize=[9,5])
ax.scatter(dfs[beam].dt, dfs[beam].h, s=1, c='k', alpha=0.1)

In [None]:
import pandas as pd
# Plot the stored ph_index_beg minus 1 twice: once in the dtype stored in the
# file and once after an explicit cast to int64, so that any difference caused
# by the integer type becomes visible.
beam = 'gt2r'
# The dataset is read once inside a context manager; previously the file was
# opened here and never closed.
with h5py.File(filename, 'r') as f:
    ph_index_beg_stored = f[beam]['geolocation']['ph_index_beg'][:]
fig, ax = plt.subplots(figsize=[9,5])
# ax.plot(Segment_Distance)
# ax.plot(Segment_Index_begin)
# ax.plot(Segment_ID)
# ax.plot(Segment_PE_count)
ax.plot(ph_index_beg_stored - 1)
ax.plot(ph_index_beg_stored.astype(np.int64) - 1)

In [None]:
%matplotlib widget
import matplotlib.pyplot as plt

In [None]:
# Height against delta time for gt1l; requires that dfs contains this beam.
gtx = 'gt1l'
fig, ax = plt.subplots(figsize=[9,5])
ax.scatter(dfs[gtx].dt, dfs[gtx].h, s=1, c='k', alpha=0.1)

In [None]:
# Height against along-track distance for the beam currently named by gtx.
fig, ax = plt.subplots(figsize=[9,5])
ax.scatter(dfs[gtx].xatc, dfs[gtx].h, s=1, c='k', alpha=0.1)

In [None]:
def read_atl03(filename, geoid_h=True, gtxs_to_read='all'):
    """
    Read in an ATL03 granule.

    Parameters
    ----------
    filename : string
        the file path of the granule to be read in
    geoid_h : boolean
        whether to include the ATL03-supplied geoid correction for photon heights
    gtxs_to_read : string or list of strings
        'all' reads every ground track present in the file, 'none' reads no photon data,
        a single name (e.g. 'gt1l') or a list/tuple/set of names reads only those tracks
        that are present in the file; any other type falls back to all available tracks

    Returns
    -------
    beams_available : list of strings
        the ground tracks that have a /<gtx>/heights/ group in the file
    ancillary : dictionary with the following keys:
                granule_id : string, the producer granule id, extracted from filename
                atlas_sdp_gps_epoch : float64, reference GPS time for ATLAS in seconds [1198800018.0]
                rgt : int16, the reference ground track number
                cycle_number : int8, the ICESat-2 cycle number of the granule
                sc_orient : the spacecraft orientation ('forward', 'backward', 'transition' or 'error')
                gtx_beam_dict : dictionary of the ground track / beam number configuration
                                example: {'gt1l': 6, 'gt1r': 5, 'gt2l': 4, 'gt2r': 3, 'gt3l': 2, 'gt3r': 1}
                gtx_strength_dict : dictionary of the ground track / beam strength configuration
                                    example: {'gt1l': 'weak','gt1r': 'strong','gt2l': 'weak', ... }
                gtx_dead_time_dict : dictionary of the mean dead time per ground track that was read
    dfs : dict of pandas dataframes, keyed by ground track name
          NOTE: this third value is ONLY returned when at least one ground track was selected
          for reading; with gtxs_to_read='none' (or when none of the requested tracks exist)
          the function returns just (beams_available, ancillary).
          Tracks that raise an error while being read are reported and left out.
          each dataframe contains the following variables
          lat : float64, latitude of the photon, degrees
          lon : float64, longitude of the photon, degrees
          h : float64, elevation of the photon (geoid correction applied if geoid_h=True), meters
          dt : float64, delta time of the photon, seconds from the ATLAS SDP GPS Epoch
          mframe : uint32, the ICESat-2 major frame that the photon belongs to
          ph_id_pulse : the pulse id of the photon within its major frame
          qual : int8, quality flag 0=nominal,1=possible_afterpulse,2=possible_impulse_response_effect,3=possible_tep
                 (photons with qual >= 3 are removed)
          xatc : float64, along-track distance of the photon, meters
          geoid : float64, geoid correction that was applied to photon elevation (supplied if geoid_h=True), meters

    Examples
    --------
    >>> read_atl03(filename='processed_ATL03_20210715182907_03381203_005_01.h5', geoid_h=True)
    """

    print('  reading in', filename)
    granule_id = filename[filename.find('ATL03_'):(filename.find('.h5')+3)]

    def orient_string(sc_orient):
        if sc_orient == 0:
            return 'backward'
        elif sc_orient == 1:
            return 'forward'
        elif sc_orient == 2:
            return 'transition'
        else:
            return 'error'

    gtl = ['gt1l', 'gt1r', 'gt2l', 'gt2r', 'gt3l', 'gt3r']
    dfs = {}

    # the context manager guarantees that the HDF5 handle is released, even if
    # something outside the per-beam try/except below raises
    with h5py.File(filename, 'r') as f:

        beams_available = [beam for beam in gtl if "/%s/heights/" % beam in f]

        # work out which ground tracks to read (kept in the canonical gt1l ... gt3r order)
        if isinstance(gtxs_to_read, str):
            if gtxs_to_read == 'all':
                beamlist = list(beams_available)
            elif gtxs_to_read == 'none':
                beamlist = []
            else:
                beamlist = [beam for beam in beams_available if beam == gtxs_to_read]
        elif isinstance(gtxs_to_read, (list, tuple, set)):
            beamlist = [beam for beam in beams_available if beam in gtxs_to_read]
        else:
            beamlist = list(beams_available)

        # beam numbers / strengths depend on the spacecraft orientation
        orient_str = orient_string(f['orbit_info']['sc_orient'][0])
        beam_strength_dict = {k:['weak','strong'][k%2] for k in np.arange(1,7,1)}
        if orient_str in ('forward', 'backward'):
            bl = np.arange(6,0,-1) if orient_str == 'forward' else np.arange(1,7,1)
            gtx_beam_dict = {k:v for (k,v) in zip(gtl,bl)}
            gtx_strength_dict = {k:beam_strength_dict[gtx_beam_dict[k]] for k in gtl}
        else:
            gtx_beam_dict = {k:'undefined' for k in gtl}
            gtx_strength_dict = {k:'undefined' for k in gtl}

        ancillary = {'granule_id': granule_id,
                     'atlas_sdp_gps_epoch': f['ancillary_data']['atlas_sdp_gps_epoch'][0],
                     'rgt': f['orbit_info']['rgt'][0],
                     'cycle_number': f['orbit_info']['cycle_number'][0],
                     'sc_orient': orient_str,
                     'gtx_beam_dict': gtx_beam_dict,
                     'gtx_strength_dict': gtx_strength_dict,
                     'gtx_dead_time_dict': {}}

        # loop through all beams
        print('  reading in beam:', end=' ')
        for beam in beamlist:

            print(beam, end=' ')
            try:
                # mean dead time: first 16 entries for strong beams, the remaining ones otherwise
                dead_time = np.array(f['ancillary_data']['calibrations']['dead_time'][beam]['dead_time'])
                if gtx_strength_dict[beam]=='strong':
                    ancillary['gtx_dead_time_dict'][beam] = np.mean(dead_time[:16])
                else:
                    ancillary['gtx_dead_time_dict'][beam] = np.mean(dead_time[16:])

                #### get photon-level data (not using ATL03 confidences here)
                heights = f[beam]['heights']
                df = pd.DataFrame({'lat': np.array(heights['lat_ph']),
                                   'lon': np.array(heights['lon_ph']),
                                   'h': np.array(heights['h_ph']),
                                   'dt': np.array(heights['delta_time']),
                                   'mframe': np.array(heights['pce_mframe_cnt']),
                                   'ph_id_pulse': np.array(heights['ph_id_pulse']),
                                   'qual': np.array(heights['quality_ph'])})
                                   # 0=nominal,1=afterpulse,2=impulse_response_effect,3=tep

                #### calculate along-track distances [meters from the equator crossing] from segment-level data
                geolocation = f[beam]['geolocation']
                ph_index_beg = np.array(geolocation['ph_index_beg'], dtype=np.int64) - 1
                segment_dist_x = np.array(geolocation['segment_dist_x'])
                segment_length = np.array(geolocation['segment_length'])
                valid = ph_index_beg>=0 # need to delete values where there's no photons in the segment (-1 value)
                # write the segment start distance at the first photon of each segment,
                # forward-fill it to the segment's other photons, then add the per-photon offset
                df['xatc'] = np.nan
                df.loc[ph_index_beg[valid], 'xatc'] = segment_dist_x[valid]
                df['xatc'] = df['xatc'].ffill() + np.array(heights['dist_ph_along'])

                #### now we can filter out TEP (we don't do IRF / afterpulses because it seems to not be very good...)
                #### and sort by along-track distance (for interpolation to work smoothly)
                df = df[df['qual'] < 3].sort_values(by='xatc').reset_index(drop=True)

                if geoid_h:
                    #### interpolate geoid to photon level using along-track distance, and add to elevation
                    geophys_geoid = np.array(f[beam]['geophys_corr']['geoid'])
                    geophys_geoid_x = segment_dist_x+0.5*segment_length
                    valid_geoid = geophys_geoid<1e10 # filter out INVALID_R4B fill values
                    geophys_geoid = geophys_geoid[valid_geoid]
                    geophys_geoid_x = geophys_geoid_x[valid_geoid]
                    # only apply the correction when there are more than 5 valid geoid values
                    # (was `len(geophys_geoid>5)`, which measured the length of a boolean array
                    # and therefore never enforced the threshold)
                    if len(geophys_geoid) > 5:
                        geoid = np.interp(np.array(df['xatc']), geophys_geoid_x, geophys_geoid)
                        df['h'] = df['h'] - geoid
                        df['geoid'] = geoid
                        del geoid
                    else:
                        df['geoid'] = 0.0

                #### save to list of dataframes
                dfs[beam] = df
                del df
                gc.collect()

            except Exception:
                # best effort: report the problem and carry on with the next beam
                print('Error for {f:s} on {b:s} ... skipping:'.format(f=filename, b=beam))
                traceback.print_exc()

    print(' --> done.')
    if len(beamlist)==0:
        return beams_available, ancillary
    else:
        return beams_available, ancillary, dfs

In [None]:
def detect_lakes(input_filename, gtx, polygon, verbose=False):
    """
    Detect melt lakes along one ground track of an ATL03 granule.

    Parameters
    ----------
    input_filename : string
        the file path of the granule, passed on to read_atl03
    gtx : string
        the ground track to process (e.g. 'gt1l')
    polygon : string
        the file path of the (simplified) region geojson; photons are clipped to the
        file obtained by removing 'simplified_' from this path
    verbose : boolean
        whether to print details/progress in the detection steps

    Returns
    -------
    thelakes : list
        the lake objects (melt_lake) that were found, possibly empty
    gtx_stats : list
        four numbers that the caller sums per granule as
        (length total, length lakes, photons total, photons lakes);
        [0,0,0,0] when there is no photon data for this ground track
    """

    # read_atl03 returns only (beams, ancillary) when the requested ground track is not
    # in the file, so the result cannot be unpacked into three names unconditionally
    read_result = read_atl03(input_filename, geoid_h=True, gtxs_to_read=gtx)
    if len(read_result) < 3: return [], [0,0,0,0]
    gtx_list, ancillary, photon_data = read_result
    # also covers the case where reading this ground track raised and it was skipped
    if gtx not in photon_data: return [], [0,0,0,0]

    print('\n-----------------------------------------------------------------------------\n')
    print('PROCESSING GROUND TRACK: %s (%s)' % (gtx, ancillary['gtx_strength_dict'][gtx]))

    # clip the photon data to the non-simplified polygon for the region to avoid overlap
    poly_nonsimplified = polygon.replace('simplified_', '')
    gdf = gpd.GeoDataFrame(photon_data[gtx], geometry=gpd.points_from_xy(photon_data[gtx].lon, photon_data[gtx].lat), crs="EPSG:4326")
    clip_shape = gpd.read_file(poly_nonsimplified)
    gdf = gpd.clip(gdf, clip_shape).reset_index(drop=True)
    df = pd.DataFrame(gdf.drop(columns='geometry'), copy=True)
    # drop the large intermediates before the memory-hungry detection steps
    del gdf, photon_data, clip_shape
    gc.collect()

    # nothing left inside the polygon: nothing to detect
    if len(df) == 0: return [], [0,0,0,0]

    # aggregate info at major frame level
    df_mframe = make_mframe_df(df)

    # get all the flat segments and select
    df_mframe = find_flat_lake_surfaces(df_mframe, df)
    df_selected = df_mframe[df_mframe.is_flat]

    # calculate densities and find second peaks (where surface is flat)
    nsubsegs = 10
    get_densities_and_2nd_peaks(df, df_mframe, df_selected, gtx, ancillary, n_subsegs=nsubsegs, print_results=verbose)

    # iteratively merge the detected segments into lakes
    df_lakes = merge_lakes(df_mframe, print_progress=verbose, debug=verbose)
    if df_lakes is None:
        return [], [df.xatc.max()-df.xatc.min(), 0.0, df.h.count(), 0]
    df_lakes = check_lake_surroundings(df_mframe, df_lakes)
    calculate_remaining_densities(df, df_mframe, df_lakes, gtx, ancillary)

    # create a list of lake object, and calculate some stats for each
    thelakes = []
    if df_lakes is not None:
        for i in range(len(df_lakes)):
            lakedata = df_lakes.iloc[i]
            thislake = melt_lake(lakedata.mframe_start, lakedata.mframe_end, lakedata.surf_elev, nsubsegs)
            thislake.add_data(df, df_mframe, gtx, ancillary, polygon)
            thislake.get_surface_elevation()
            thislake.get_surface_extent()
            thislake.calc_quality_lake()
            thelakes.append(thislake)

    # remove any duplicates and make sure data segments don't overlap into other lakes' water surfaces
    thelakes = remove_duplicate_lakes(thelakes, df, df_mframe, gtx, ancillary, polygon, nsubsegs, verbose=verbose)
    print_results(thelakes, gtx)

    # get gtx stats
    gtx_stats = get_gtx_stats(df, thelakes)

    del df, df_mframe, df_selected, df_lakes
    gc.collect()

    return thelakes, gtx_stats

In [None]:
import argparse
import os
import sys
import pickle
import subprocess
import numpy as np
import icelakes
from icelakes.utilities import encedc, decedc, get_size
from icelakes.nsidc import download_granule, edc
from icelakes.detection import read_atl03, detect_lakes, melt_lake

# Command-line interface, built from a (flag, default, help text) table.
# Every option is a plain string option.
parser = argparse.ArgumentParser(description='Test script to print some stats for a given ICESat-2 ATL03 granule.')
for cli_flag, cli_default, cli_help in [
    ('--granule', 'ATL03_20220714010847_03381603_006_02.h5',
     'The producer_id of the input ATL03 granule'),
    ('--polygon', 'geojsons/jakobshavn_small.geojson',
     'The file path of a geojson file for spatial subsetting'),
    ('--is2_data_dir', 'IS2data',
     'The directory into which to download ICESat-2 granules'),
    ('--download_gtxs', 'all',
     'String value or list of gtx names to download, also accepts "all"'),
    ('--out_data_dir', 'detection_out_data',
     'The directory to which to write the output data'),
    ('--out_plot_dir', 'detection_out_plot',
     'The directory to which to write the output plots'),
    ('--out_stat_dir', 'detection_out_stat',
     'The directory to which to write the granule stats'),
]:
    parser.add_argument(cli_flag, type=str, default=cli_default, help=cli_help)

# inside the notebook there is no command line to parse, so an empty argument
# list is passed and every option takes its default
# args = parser.parse_args()
args = parser.parse_args([])

In [None]:
# try to figure out where the script is being executed (just to show those maps at conferences, etc...)
def extract_compute_latlon(raw_stdout, default='0.0,0.0'):
    """
    Pull the text wrapped in '<x><y><z>' ... '<z><y><x>' out of the wrapper script output.

    Parameters
    ----------
    raw_stdout : bytes or string
        the captured standard output of the location wrapper script
    default : any
        the value to return when the markers (or any text between them) are missing

    Returns
    -------
    the text between the two markers with surrounding whitespace removed, or default
    """
    if isinstance(raw_stdout, bytes):
        text = raw_stdout.decode('utf-8', errors='replace')
    else:
        text = str(raw_stdout)
    start_tag, end_tag = '<x><y><z>', '<z><y><x>'
    start = text.find(start_tag)
    if start < 0:
        return default
    start += len(start_tag)
    end = text.find(end_tag, start)
    if end < 0:
        return default
    # an empty match is as useless as no match at all
    return text[start:end].strip() or default

compute_latlon = None
try:
    with open('location-wrapper.sh', 'rb') as file: script = file.read()
    geoip_out = subprocess.run(script, shell=True, capture_output=True)
    compute_latlon = extract_compute_latlon(geoip_out.stdout, default=None)
except Exception:
    # deliberately best effort: the location is only informational
    compute_latlon = None

if compute_latlon is None:
    compute_latlon='0.0,0.0'
    print('\nUnable to determine compute location for this script.\n')
else:
    print('\nThis job is running at the following lat/lon location:%s\n' % compute_latlon)

# # shuffling files around for HTCondor
# for thispath in (args.is2_data_dir, args.out_data_dir, args.out_plot_dir):
#     if not os.path.exists(thispath): os.makedirs(thispath)

# # download the specified ICESat-2 data from NSIDC
# input_filename, request_status_code = download_granule(args.granule, args.download_gtxs, args.polygon, args.is2_data_dir, 
#                                              decedc(edc().u), decedc(edc().p))

# # perform a bunch of checks to make sure everything went alright with the nsidc api
# print('Request status code:', request_status_code, request_status_code==200)
# if request_status_code != 200:
#     print('NSIDC API request failed.')
#     sys.exit(127)
# if request_status_code==200:
#     with open('success.txt', 'w') as f: print('we got some sweet data', file=f)
#     if input_filename == 'none': 
#         print('granule seems to be empty. nothing more to do here.') 
#         sys.exit(69)
# if os.path.exists(input_filename):
#     if os.path.getsize(input_filename) < 31457280:# 30 MB
#         print('granule seems to be empty. nothing more to do here.') 
#         sys.exit(69)


In [None]:
# path of an already downloaded granule file (the download step above is commented out)
input_filename = 'IS2data/processed_ATL03_20220714010847_03381603_006_02.h5'
# with gtxs_to_read='none' no photon data is read and read_atl03 returns only two values:
# the list of ground tracks available in the file, and the ancillary dictionary
gtx_list, ancillary = read_atl03(input_filename, gtxs_to_read='none')

In [None]:
# detect melt lakes
lake_list = []
granule_stats = [0,0,0,0]

# make sure the output locations exist before anything gets written to them
# (saving figures / data / stats into a missing directory raises FileNotFoundError)
for thispath in (args.out_data_dir, args.out_plot_dir, args.out_stat_dir):
    os.makedirs(thispath, exist_ok=True)

# run the detection one ground track at a time and accumulate the per-track stats
for gtx in gtx_list:
    lakes_found, gtx_stats = detect_lakes(input_filename, gtx, args.polygon, verbose=False)
    for i in range(len(granule_stats)): granule_stats[i] += gtx_stats[i]
    lake_list += lakes_found

if granule_stats[0] > 0:
    with open('success.txt', 'w') as f: print('we got some data from NSIDC!!', file=f)
    print('Sucessfully retrieved data from NSIDC!!')

# print stats for granule
print('\nGRANULE STATS (length total, length lakes, photons total, photons lakes):%.3f,%.3f,%i,%i\n' % tuple(granule_stats))

# for each lake call the surrf algorithm for depth determination
print('---> determining depth for each lake')
for lake in lake_list:
    lake.surrf()
    print('   --> %8.3fN, %8.3fE: %6.2fm deep / quality: %8.2f' % (lake.lat,lake.lon,lake.max_depth,lake.lake_quality))

# remove zero quality lakes
lake_list[:] = [lake for lake in lake_list if lake.lake_quality > 0]

for i, lake in enumerate(lake_list):
    lake.lake_id = '%s_%s_%s_%04i' % (lake.polygon_name, lake.granule_id[:-3], lake.gtx, i)
    # the leading number makes higher-quality lakes sort first by file name
    filename_base = 'lake_%05i_%s_%s_%s' % (np.clip(1000-lake.lake_quality,0,None)*10,
                                                       lake.ice_sheet, lake.melt_season,
                                                       lake.lake_id)
    # plot each lake and save to image
    fig = lake.plot_lake(closefig=False)
    figname = args.out_plot_dir + '/%s.jpg' % filename_base
    if fig is not None: fig.savefig(figname, dpi=300, bbox_inches='tight', pad_inches=0)

    # export each lake to h5, and fall back to pickle if that fails
    try:
        h5name = args.out_data_dir + '/%s.h5' % filename_base
        datafile = lake.write_to_hdf5(h5name)
        print('Wrote data file: %s, %s' % (datafile, get_size(datafile)))
    except Exception as h5_error:
        print('Could not write hdf5 file.')
        print('   reason: %r' % (h5_error,))
        try:
            pklname = args.out_data_dir + '/%s.pkl' % filename_base
            with open(pklname, 'wb') as f: pickle.dump(vars(lake), f)
            print('Wrote data file: %s, %s' % (pklname, get_size(pklname)))
        except Exception as pkl_error:
            print('Could not write pickle file.')
            print('   reason: %r' % (pkl_error,))

# NOTE(review): args.granule[:-4] removes one character more than the '.h5' extension
# (compare lake.granule_id[:-3] above); left as is because changing it would rename the
# stats files - confirm whether anything downstream relies on the current names
statsfname = args.out_stat_dir + '/stats_%s_%s.csv' % (args.polygon[args.polygon.rfind('/')+1:].replace('.geojson',''), args.granule[:-4])
with open(statsfname, 'w') as f: print('%.3f,%.3f,%i,%i,%s' % tuple(granule_stats+[compute_latlon]), file=f)

# clean up the input data
# os.remove(input_filename)

print('\n-------------------------------------------------')
print(  '----------->   Python script done!   <-----------')
print(  '-------------------------------------------------\n')

In [None]:
%matplotlib widget
import matplotlib.pylab as plt
from cmcrameri import cm as cmc
# Diagnostic view of one detected lake: photon heights relative to the lake surface,
# coloured by snr (photons flagged as afterpulses drawn faintly), overlaid with the
# rolling mean of the per-pulse sat_ratio.
lk = lake_list[2]
dfp = lk.photon_data
afterpulse_flag = dfp.is_afterpulse
dfs, dfap = dfp[~afterpulse_flag], dfp[afterpulse_flag]
fig, ax = plt.subplots(figsize=[8, 4.5], dpi=100)
snr_style = dict(s=1, cmap=cmc.batlow_r, vmin=0, vmax=1)
ax.scatter(dfs.xatc, dfs.h-lk.surface_elevation, c=dfs.snr, **snr_style)
ax.scatter(dfap.xatc, dfap.h-lk.surface_elevation, c=dfap.snr, alpha=0.2, **snr_style)
# ax.scatter(dfp.xatc, dfp.h, s=10, c='g')
# per-pulse means, then a centered 20-pulse rolling mean of the saturation ratio
dfg = dfs.groupby('pulseid').mean()
sat_ratio_smooth = dfg.sat_ratio.rolling(20,center=True).mean()
ax.scatter(dfg.xatc, sat_ratio_smooth, s=1, c=dfg.sat_ratio, cmap=cmc.roma_r, alpha=0.2, vmin=0, vmax=1)

In [None]:
# per-pulse column means of the photons in dfs (the non-afterpulse subset from the cell above)
dfs.groupby('pulseid').mean()

In [None]:
# display the photon data of the lake selected above
dfp

In [None]:
from utils import *
from os import listdir, makedirs
from os.path import isfile, join, exists
import os
import rsa

# arguments for future script

In [None]:
# producer id of the ATL03 granule to request
granule = 'ATL03_20210715182907_03381203_005_01.h5'
# shapefile handed to download_granule_nsidc further down for spatial subsetting
shapefile = '/shapefiles/jakobshavn_small.shp'
# ground track(s) handed to download_granule_nsidc further down
gtxs = 'gt1l'

# download the specified granule via NSIDC
...and subset to the provided shapefile / only pull the variables needed

In [None]:
# data directory handed to download_granule_nsidc; note the leading '/', which is
# stripped again (datadir[1:]) when the downloaded files are listed further down
datadir = '/IS2data'

In [None]:
# %%capture celloutput 
# request the granule for the chosen ground track(s) and shapefile into datadir;
# the last two arguments are the login values produced by decedc(edc().u) / decedc(edc().p)
# NOTE(review): download_granule_nsidc is not defined or imported by name in this notebook -
# presumably it comes from the `from utils import *` star import; confirm
download_granule_nsidc(granule, gtxs, shapefile, datadir, decedc(edc().u), decedc(edc().p))

# read in the .h5 subsetted granule file

In [None]:
# collect the .h5 files in the data directory (datadir carries a leading '/', which is
# stripped to get a relative path); sorted so that filelist[0] does not depend on the
# arbitrary order in which listdir returns the entries
filelist = [datadir[1:]+'/'+f for f in sorted(listdir(datadir[1:]))
            if isfile(join(datadir[1:], f)) and f.endswith('.h5')]
print('\nNumber of processed ATL03 granules to read in: ' + str(len(filelist)))

# NOTE(review): the read_atl03 defined earlier in this notebook returns
# (beams_available, ancillary, dfs); this unpacking presumably targets an older
# read_atl03 pulled in by `from utils import *` - confirm which one is in scope here
photon_data, bckgrd_data, ancillary = read_atl03(filelist[0], geoid_h=True)
print_granule_stats(photon_data, bckgrd_data, ancillary, outfile='stats.txt')