# Make Granule List

In [1]:
# import utility function for making granule list
from icelakes.nsidc import make_granule_list
import numpy as np
import pandas as pd
import os
import geopandas as gpd
from shapely.geometry import Polygon
from shapely.geometry.polygon import orient

In [None]:
# # function to convert shapefile to geojson - if needed
# shp2geojson_nsidc('shapefiles/jakobshavn_small.shp')

# Corner longitudes / latitudes of a small rectangular test box.
lons = [-154, -153]
lats = [-85.46, -85.41]

# Walk the four corners as (lon index, lat index) pairs and repeat the first
# corner at the end so that the ring is closed.
coords = [
    (lons[i_lon], lats[i_lat])
    for i_lon, i_lat in [(0, 0), (1, 0), (1, 1), (0, 1), (0, 0)]
]
poly = Polygon(coords)
gdf = gpd.GeoDataFrame(geometry=[poly], crs='EPSG:4326')

# Write the box as GeoJSON; the granule list gets the same base name with a
# .csv extension (directory part dropped).
fn_gjsn = 'geojsons/test_ross.geojson'
gdf.to_file(fn_gjsn, driver='GeoJSON')
outname_list = fn_gjsn.rsplit('/', 1)[-1].replace('.geojson', '.csv')

In [None]:
# echo the closed (lon, lat) ring of the test polygon as a sanity check
coords

In [None]:
# Run the granule search for the test geojson (file name only, no directory)
# restricted to a single day, then display the returned table.
df = make_granule_list(
    fn_gjsn.split('/')[-1],
    start_date='2022-01-04',
    end_date='2022-01-04',
    icesheet='AIS',
    meltseason='2021-22',
    list_out_name=outname_list,
    version=None,
)
df

In [None]:
# Reference list of the granule-list CSV files that the cells further down write.
# These are bare string literals: running the cell has no effect other than
# echoing the last one.
'granule_lists/GRE_2000_May2019_Jun2023.csv'
'granule_lists/ANT_1000_Dec2018_Mar2021.csv'
'granule_lists/ANT_1000_Dec2021_Mar2023.csv'
'granule_lists/GRE_2000_extraMaySep.csv'
'granule_lists/GRE_2000_May2023_Sep2023_newdata.csv'

'granule_lists/alldata_2018-2023.csv'

# Greenland 2023 (new data only)

In [69]:
# Output files: the list of new granules only, and the list of everything so far.
filename_new = 'granule_lists/GRE_2000_May2023_Sep2023_newdata.csv'
filename_all = 'granule_lists/alldata_2018-2023.csv'

# Search window (MM-DD, applied to every year from startyear through endyear).
startyear, endyear = 2023, 2023
startday, endday = '05-01', '09-30'
icesheet = 'GrIS'

# Region outlines: every regular file in gjsn_dir whose name contains searchfor.
searchfor = 'simplified_GRE_2000'
gjsn_dir = 'geojsons'
gjsn_list = [
    gjsn_dir + '/' + f
    for f in os.listdir(gjsn_dir)
    if os.path.isfile(os.path.join(gjsn_dir, f)) and (searchfor in f)
]
print('____________________________________________________________________________')
print('GREENLAND 2023')
print('____________________________________________________________________________')

# One query per region and year; every result table is collected in dflist.
dflist = []
for i, gjsn in enumerate(gjsn_list):
    geojson = gjsn.rsplit('/', 1)[-1]
    print('----------------------------------------------------------------')
    print(geojson, '(', i+1, '/', len(gjsn_list), ')')
    for yr in np.arange(startyear, endyear+1):
        start_date = '%s-%s' % (yr, startday)
        end_date = '%s-%s' % (yr, endday)
        # label is the year itself, or "YYYY-YY" when the window spans two years
        if start_date[:4] == end_date[:4]:
            meltseason = start_date[:4]
        else:
            meltseason = start_date[:4] + '-' + end_date[2:4]
        outname = 'granule_lists/' + '_'.join([icesheet, meltseason, geojson.replace('.geojson','')]) + '.csv'
        df = make_granule_list(geojson, start_date, end_date, icesheet, meltseason, outname)
        dflist.append(df)

# Stack all results and strip the file-name prefixes from the description labels.
df_all = pd.concat(dflist)
df_all.loc[:, 'description'] = [
    d.replace('simplified_','').replace('GRE_','').replace('ANT_','')
    for d in df_all['description']
]
df_all = df_all.reset_index(drop=True)

def filter_release(df):
    """Keep only the newest version and release of each granule per region.

    Granule names are split on underscores, e.g.
    ``ATL03_20230622065452_00322003_006_02.h5``: field 2 is stored as
    ``ttttccnn``, field 3 as ``version`` and the first two characters of the
    last field as ``release`` (all parsed as integers).
    NOTE(review): ttttccnn presumably encodes ground track / cycle / segment
    as in the ATL03 file naming convention - confirm.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the string columns ``granule`` and ``geojson_clip``.
        The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without exact duplicate rows, with the added columns
        ``release``, ``version``, ``ttttccnn``, ``granule_region_id`` and
        ``select`` (all True). Within each ``granule_region_id`` only rows
        with the highest version are kept, and among those only rows with the
        highest release. Rows tied on both are all kept, so ids are not
        guaranteed to be unique in the result.
    """
    # there should not be any, but to make sure drop any duplicates
    # (explicit copy, so the column assignments below never write into a view)
    df = df.drop_duplicates(ignore_index=True).copy()

    # parse release / version / ttttccnn out of the granule file name
    fields = df['granule'].str.split('_')
    df['release'] = fields.str[-1].str[:2].astype('int64')
    df['version'] = fields.str[3].astype('int64')
    df['ttttccnn'] = fields.str[2].astype('int64')

    # make id for granule / region combination
    df['granule_region_id'] = df['ttttccnn'].astype(str) + df['geojson_clip']

    # first keep the newest version per id, then the newest release among those
    for column in ('version', 'release'):
        newest = df.groupby('granule_region_id')[column].transform('max')
        df['select'] = df[column] == newest
        df = df[df['select']].copy()

    return df

# get the granules that have already been processed
in_list = [
    'granule_lists/GRE_2000_May2019_Jun2023.csv',
    'granule_lists/ANT_1000_Dec2018_Mar2021.csv',
    'granule_lists/ANT_1000_Dec2021_Mar2023.csv',
    'granule_lists/extra_shoulderseason_GRE_2000_ANT_1000.csv']
nms = ['granule', 'geojson', 'description', 'geojson_clip', 'size_mb']
dfs_all_input = []
for grlist in in_list:
    # the "_size" variant of each list is read; it carries the extra size_mb column
    dfs_all_input.append(pd.read_csv(grlist.replace('.csv', '_size.csv'), header=None, names=nms))
df_processed = pd.concat(dfs_all_input).reset_index(drop=True)
df_already_run = filter_release(df_processed)

# keep only the granule / region combinations that have not been run before
# (.copy() so adding the 'select' column never writes into a filtered view)
df_new = filter_release(df_all).copy()
ids_run_already = list(df_already_run.granule_region_id)
df_new['select'] = ~df_new.granule_region_id.isin(ids_run_already)
df_new = df_new[df_new.select].reset_index(drop=True)
# first five columns = granule list including size_mb
df_new.iloc[:,:5].to_csv(filename_new.replace('.csv', '_size.csv'), header=False, index=False)

print('Number of ganules over Greenland:', df_new['description'].str.contains('GrIS', regex=False).sum())
print('Number of ganules over Antarctica:', df_new['description'].str.contains('AIS', regex=False).sum())
print('Total number of granules:', len(df_new))
if len(df_new) > 0:
    # idxmax returns the row label, which is what .loc expects
    maxrow = df_new.loc[df_new.size_mb.idxmax(),:]
    print('Largest granule: %.1f GB, %s, %s' % (maxrow.size_mb/1000, maxrow.granule, maxrow.geojson))
print('Total size: %.2f TB' % (np.sum(df_new.size_mb)/1e6))

# first four columns = granule list without size_mb
df_new_nosize = df_new.iloc[:,:4].copy()
df_new_nosize.to_csv(filename_new, header=False, index=False)

print('____________________________________________________________________________')
print('ALL DATA COMBINED')
# new rows come first, so for an id present in both frames the new row is the one kept
df_everything = (
    pd.concat((df_new, df_already_run))
    .drop_duplicates(ignore_index=True, subset=['granule_region_id'])
    .sort_values(by='granule')
    .reset_index(drop=True)
)
print('Number of ganules over Greenland:', df_everything['description'].str.contains('GrIS', regex=False).sum())
print('Number of ganules over Antarctica:', df_everything['description'].str.contains('AIS', regex=False).sum())
print('Total number of granules:', len(df_everything))
if len(df_everything) > 0:
    maxrow = df_everything.loc[df_everything.size_mb.idxmax(),:]
    print('Largest granule: %.1f GB, %s, %s' % (maxrow.size_mb/1000, maxrow.granule, maxrow.geojson))
print('Total size: %.2f TB' % (np.sum(df_everything.size_mb)/1e6))

df_everything.iloc[:,:5].to_csv(filename_all.replace('.csv', '_size.csv'), header=False, index=False)
df_everything_nosize = df_everything.iloc[:,:4].copy()
df_everything_nosize.to_csv(filename_all, header=False, index=False)

____________________________________________________________________________
GREENLAND 2023
____________________________________________________________________________
----------------------------------------------------------------
simplified_GRE_2000_NE.geojson ( 1 / 7 )
Found 391 ATL03 version 006 granules over simplified_GRE_2000_NE.geojson between 2023-05-01 and 2023-09-30.
----------------------------------------------------------------
simplified_GRE_2000_SW.geojson ( 2 / 7 )
Found 97 ATL03 version 006 granules over simplified_GRE_2000_SW.geojson between 2023-05-01 and 2023-09-30.
----------------------------------------------------------------
simplified_GRE_2000_CE.geojson ( 3 / 7 )
Found 184 ATL03 version 006 granules over simplified_GRE_2000_CE.geojson between 2023-05-01 and 2023-09-30.
----------------------------------------------------------------
simplified_GRE_2000_NO.geojson ( 4 / 7 )
Found 883 ATL03 version 006 granules over simplified_GRE_2000_NO.geojson between 202

In [58]:
def filter_release(df):
    """Keep only the newest version and release of each granule per region.

    Granule names are split on underscores, e.g.
    ``ATL03_20230622065452_00322003_006_02.h5``: field 2 is stored as
    ``ttttccnn``, field 3 as ``version`` and the first two characters of the
    last field as ``release`` (all parsed as integers).
    NOTE(review): ttttccnn presumably encodes ground track / cycle / segment
    as in the ATL03 file naming convention - confirm.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the string columns ``granule`` and ``geojson_clip``.
        The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without exact duplicate rows, with the added columns
        ``release``, ``version``, ``ttttccnn``, ``granule_region_id`` and
        ``select`` (all True). Within each ``granule_region_id`` only rows
        with the highest version are kept, and among those only rows with the
        highest release. Rows tied on both are all kept, so ids are not
        guaranteed to be unique in the result.
    """
    # there should not be any, but to make sure drop any duplicates
    # (explicit copy, so the column assignments below never write into a view)
    df = df.drop_duplicates(ignore_index=True).copy()

    # parse release / version / ttttccnn out of the granule file name
    fields = df['granule'].str.split('_')
    df['release'] = fields.str[-1].str[:2].astype('int64')
    df['version'] = fields.str[3].astype('int64')
    df['ttttccnn'] = fields.str[2].astype('int64')

    # make id for granule / region combination
    df['granule_region_id'] = df['ttttccnn'].astype(str) + df['geojson_clip']

    # first keep the newest version per id, then the newest release among those
    for column in ('version', 'release'):
        newest = df.groupby('granule_region_id')[column].transform('max')
        df['select'] = df[column] == newest
        df = df[df['select']].copy()

    return df

In [59]:
# Load every granule list that has already been processed (the "_size"
# variant of each file, which has a size_mb column) and reduce the combined
# table with filter_release.
in_list = [
    'granule_lists/GRE_2000_May2019_Jun2023.csv',
    'granule_lists/ANT_1000_Dec2018_Mar2021.csv',
    'granule_lists/ANT_1000_Dec2021_Mar2023.csv',
    'granule_lists/extra_shoulderseason_GRE_2000_ANT_1000.csv',
]
nms = ['granule', 'geojson', 'description', 'geojson_clip', 'size_mb']
dfs_all_input = [
    pd.read_csv(grlist.replace('.csv', '_size.csv'), header=None, names=nms)
    for grlist in in_list
]
df_processed = pd.concat(dfs_all_input, ignore_index=True)
df_already_run = filter_release(df_processed)

In [60]:
# counts = df_already_run.groupby('granule_region_id')[['granule']].count().rename(columns={'granule': 'counts'})
# counts = counts[counts.counts > 1]
# for i in range(len(counts)):
#     thisdf = df_already_run[df_already_run.granule_region_id == list(counts.index)[0]]
#     print(' ')
#     for j in range(len(thisdf)):
#         x = thisdf.iloc[j]
#         print(x.granule, x.geojson, x.description)

In [61]:
filename_out = 'granule_lists/GRE_2000_May2023_Sep2023_newdata.csv'

# keep only the granule / region combinations that have not been run before
# (.copy() so adding the 'select' column never writes into a filtered view)
df_new = filter_release(df_all).copy()
ids_run_already = list(df_already_run.granule_region_id)
df_new['select'] = ~df_new.granule_region_id.isin(ids_run_already)
df_new = df_new[df_new.select].reset_index(drop=True)
# first five columns = granule list including size_mb
df_new.iloc[:,:5].to_csv(filename_out.replace('.csv', '_size.csv'), header=False, index=False)

print('Number of ganules over Greenland:', df_new['description'].str.contains('GrIS', regex=False).sum())
print('Number of ganules over Antarctica:', df_new['description'].str.contains('AIS', regex=False).sum())
print('Total number of granules:', len(df_new))
if len(df_new) > 0:
    # idxmax returns the row label, which is what .loc expects
    maxrow = df_new.loc[df_new.size_mb.idxmax(),:]
    print('Largest granule: %.1f GB, %s, %s' % (maxrow.size_mb/1000, maxrow.granule, maxrow.geojson))
print('Total size: %.2f TB' % (np.sum(df_new.size_mb)/1e6))

# first four columns = granule list without size_mb
df_new_nosize = df_new.iloc[:,:4].copy()
df_new_nosize.to_csv(filename_out, header=False, index=False)

filename_all = 'granule_lists/alldata_2018-2023.csv'
# new rows come first, so for an id present in both frames the new row is the one kept
df_everything = (
    pd.concat((df_new, df_already_run))
    .drop_duplicates(ignore_index=True, subset=['granule_region_id'])
    .sort_values(by='granule')
    .reset_index(drop=True)
)
print('Number of ganules over Greenland:', df_everything['description'].str.contains('GrIS', regex=False).sum())
print('Number of ganules over Antarctica:', df_everything['description'].str.contains('AIS', regex=False).sum())
print('Total number of granules:', len(df_everything))
if len(df_everything) > 0:
    maxrow = df_everything.loc[df_everything.size_mb.idxmax(),:]
    print('Largest granule: %.1f GB, %s, %s' % (maxrow.size_mb/1000, maxrow.granule, maxrow.geojson))
print('Total size: %.2f TB' % (np.sum(df_everything.size_mb)/1e6))

df_everything.iloc[:,:5].to_csv(filename_all.replace('.csv', '_size.csv'), header=False, index=False)
df_everything_nosize = df_everything.iloc[:,:4].copy()
df_everything_nosize.to_csv(filename_all, header=False, index=False)
df_everything_nosize

Number of ganules over Greenland: 1374
Number of ganules over Antarctica: 0
Total number of granules: 1374
Largest granule: 7.5 GB, ATL03_20230630192302_01622005_006_02.h5, geojsons/simplified_GRE_2000_NE.geojson
Total size: 3.24 TB


Unnamed: 0,granule,geojson,description,geojson_clip
0,ATL03_20230622065452_00322003_006_02.h5,geojsons/simplified_GRE_2000_NE.geojson,GrIS_2023_2000_NE,geojsons/GRE_2000_NE.geojson
1,ATL03_20230622070018_00322004_006_02.h5,geojsons/simplified_GRE_2000_NE.geojson,GrIS_2023_2000_NE,geojsons/GRE_2000_NE.geojson
2,ATL03_20230623062913_00472003_006_02.h5,geojsons/simplified_GRE_2000_NE.geojson,GrIS_2023_2000_NE,geojsons/GRE_2000_NE.geojson
3,ATL03_20230623204315_00562004_006_02.h5,geojsons/simplified_GRE_2000_NE.geojson,GrIS_2023_2000_NE,geojsons/GRE_2000_NE.geojson
4,ATL03_20230623204824_00562005_006_02.h5,geojsons/simplified_GRE_2000_NE.geojson,GrIS_2023_2000_NE,geojsons/GRE_2000_NE.geojson
...,...,...,...,...
1369,ATL03_20230921165327_00412105_006_02.h5,geojsons/simplified_GRE_2000_CW.geojson,GrIS_2023_2000_CW,geojsons/GRE_2000_CW.geojson
1370,ATL03_20230925040010_00942103_006_02.h5,geojsons/simplified_GRE_2000_CW.geojson,GrIS_2023_2000_CW,geojsons/GRE_2000_CW.geojson
1371,ATL03_20230925164503_01022105_006_02.h5,geojsons/simplified_GRE_2000_CW.geojson,GrIS_2023_2000_CW,geojsons/GRE_2000_CW.geojson
1372,ATL03_20230929035148_01552103_006_02.h5,geojsons/simplified_GRE_2000_CW.geojson,GrIS_2023_2000_CW,geojsons/GRE_2000_CW.geojson


In [67]:
filename_all = 'granule_lists/alldata_2018-2023.csv'
# Merge new and previously processed granules, keep one row per granule/region id
# (new rows come first, so they win over already-processed ones), order by granule name.
df_everything = (
    pd.concat((df_new, df_already_run))
    .drop_duplicates(ignore_index=True, subset=['granule_region_id'])
    .sort_values(by='granule')
    .reset_index(drop=True)
)
print('Number of ganules over Greenland:', df_everything['description'].str.contains('GrIS', regex=False).sum())
print('Number of ganules over Antarctica:', df_everything['description'].str.contains('AIS', regex=False).sum())
print('Total number of granules:', len(df_everything))
if len(df_everything) > 0:
    # idxmax returns the row label, which is what .loc expects
    maxrow = df_everything.loc[df_everything.size_mb.idxmax(),:]
    print('Largest granule: %.1f GB, %s, %s' % (maxrow.size_mb/1000, maxrow.granule, maxrow.geojson))
print('Total size: %.2f TB' % (np.sum(df_everything.size_mb)/1e6))

# write the list with (first five columns) and without (first four columns) size_mb
df_everything.iloc[:,:5].to_csv(filename_all.replace('.csv', '_size.csv'), header=False, index=False)
df_everything_nosize = df_everything.iloc[:,:4].copy()
df_everything_nosize.to_csv(filename_all, header=False, index=False)
df_everything_nosize

Number of ganules over Greenland: 10136
Number of ganules over Antarctica: 42753
Total number of granules: 52889
Largest granule: 12.2 GB, ATL03_20220511191525_07591505_006_01.h5, geojsons/simplified_GRE_2000_NO.geojson
Total size: 135.50 TB


Unnamed: 0,granule,geojson,description,geojson_clip
0,ATL03_20181031235247_05090112_006_02.h5,geojsons/simplified_ANT_1000_West_F-G.geojson,AIS_2018_1000_West_F-G,geojsons/ANT_1000_West_F-G.geojson
1,ATL03_20181101024758_05110110_006_02.h5,geojsons/simplified_ANT_1000_East_K-A.geojson,AIS_2018_1000_East_K-A,geojsons/ANT_1000_East_K-A.geojson
2,ATL03_20181101025539_05110111_006_02.h5,geojsons/simplified_ANT_1000_West_Ep-F.geojson,AIS_2018_1000_West_Ep-F,geojsons/ANT_1000_West_Ep-F.geojson
3,ATL03_20181101030122_05110112_006_02.h5,geojsons/simplified_ANT_1000_West_Ep-F.geojson,AIS_2018_1000_West_Ep-F,geojsons/ANT_1000_West_Ep-F.geojson
4,ATL03_20181101042215_05120110_006_02.h5,geojsons/simplified_ANT_1000_East_K-A.geojson,AIS_2018_1000_East_K-A,geojsons/ANT_1000_East_K-A.geojson
...,...,...,...,...
52884,ATL03_20230930161101_01782105_006_02.h5,geojsons/simplified_GRE_2000_NO.geojson,GrIS_2023_2000_NO,geojsons/GRE_2000_NO.geojson
52885,ATL03_20230930161101_01782105_006_02.h5,geojsons/simplified_GRE_2000_NE.geojson,GrIS_2023_2000_NE,geojsons/GRE_2000_NE.geojson
52886,ATL03_20230930174010_01792104_006_02.h5,geojsons/simplified_GRE_2000_NO.geojson,GrIS_2023_2000_NO,geojsons/GRE_2000_NO.geojson
52887,ATL03_20230930174519_01792105_006_02.h5,geojsons/simplified_GRE_2000_NO.geojson,GrIS_2023_2000_NO,geojsons/GRE_2000_NO.geojson


In [47]:
# compare the row count with the number of distinct granule/region ids;
# a smaller second number means some ids occur more than once
print(len(df_already_run.granule_region_id), len(np.unique(df_already_run.granule_region_id)))

51814 51515


In [33]:
# spot-check: is this one granule/region id among the already-processed ones?
'272005geojsons/GRE_2000_NO.geojson' in list(df_already_run.granule_region_id)

True

In [35]:
# flag the rows of df_new whose granule/region id has not been run yet, and count them
ids_run_already = list(df_already_run.granule_region_id)
df_new['select'] = df_new.apply(lambda x: x.granule_region_id not in ids_run_already, axis=1)
print(np.sum(df_new.select))
df_already_run  # bare expression that is not last in the cell: nothing is displayed for it
df_new

1374


Unnamed: 0,granule,geojson,description,geojson_clip,size_mb,granule_norelease,release,version,ttttccnn,granule_region_id,select
0,ATL03_20230501225732_06351904_006_02.h5,geojsons/simplified_GRE_2000_NE.geojson,GrIS_2023_2000_NE,geojsons/GRE_2000_NE.geojson,2336.841153,ATL03_20230501225732_06351904_006,2,6,6351904,6351904geojsons/GRE_2000_NE.geojson,False
1,ATL03_20230501230241_06351905_006_02.h5,geojsons/simplified_GRE_2000_NE.geojson,GrIS_2023_2000_NE,geojsons/GRE_2000_NE.geojson,1538.320868,ATL03_20230501230241_06351905_006,2,6,6351905,6351905geojsons/GRE_2000_NE.geojson,False
2,ATL03_20230502223152_06501904_006_02.h5,geojsons/simplified_GRE_2000_NE.geojson,GrIS_2023_2000_NE,geojsons/GRE_2000_NE.geojson,1913.935168,ATL03_20230502223152_06501904_006,2,6,6501904,6501904geojsons/GRE_2000_NE.geojson,False
3,ATL03_20230502223701_06501905_006_02.h5,geojsons/simplified_GRE_2000_NE.geojson,GrIS_2023_2000_NE,geojsons/GRE_2000_NE.geojson,1979.593719,ATL03_20230502223701_06501905_006,2,6,6501905,6501905geojsons/GRE_2000_NE.geojson,False
4,ATL03_20230503092628_06571903_006_02.h5,geojsons/simplified_GRE_2000_NE.geojson,GrIS_2023_2000_NE,geojsons/GRE_2000_NE.geojson,7311.369440,ATL03_20230503092628_06571903_006,2,6,6571903,6571903geojsons/GRE_2000_NE.geojson,False
...,...,...,...,...,...,...,...,...,...,...,...
2063,ATL03_20230921165327_00412105_006_02.h5,geojsons/simplified_GRE_2000_CW.geojson,GrIS_2023_2000_CW,geojsons/GRE_2000_CW.geojson,3272.413673,ATL03_20230921165327_00412105_006,2,6,412105,412105geojsons/GRE_2000_CW.geojson,True
2064,ATL03_20230925040010_00942103_006_02.h5,geojsons/simplified_GRE_2000_CW.geojson,GrIS_2023_2000_CW,geojsons/GRE_2000_CW.geojson,501.831805,ATL03_20230925040010_00942103_006,2,6,942103,942103geojsons/GRE_2000_CW.geojson,True
2065,ATL03_20230925164503_01022105_006_02.h5,geojsons/simplified_GRE_2000_CW.geojson,GrIS_2023_2000_CW,geojsons/GRE_2000_CW.geojson,2889.815962,ATL03_20230925164503_01022105_006,2,6,1022105,1022105geojsons/GRE_2000_CW.geojson,True
2066,ATL03_20230929035148_01552103_006_02.h5,geojsons/simplified_GRE_2000_CW.geojson,GrIS_2023_2000_CW,geojsons/GRE_2000_CW.geojson,833.089432,ATL03_20230929035148_01552103_006,2,6,1552103,1552103geojsons/GRE_2000_CW.geojson,True


In [31]:
# view the already-processed granules ordered by granule name (df_already_run itself is unchanged)
df_already_run.sort_values(by='granule')

Unnamed: 0,granule,geojson,description,geojson_clip,size_mb,granule_norelease,release,version,ttttccnn,granule_region_id,select
37830,ATL03_20181031235247_05090112_006_02.h5,geojsons/simplified_ANT_1000_West_F-G.geojson,AIS_2018_1000_West_F-G,geojsons/ANT_1000_West_F-G.geojson,2392.604625,ATL03_20181031235247_05090112_006,2,6,5090112,5090112geojsons/ANT_1000_West_F-G.geojson,True
37439,ATL03_20181101024758_05110110_006_02.h5,geojsons/simplified_ANT_1000_East_K-A.geojson,AIS_2018_1000_East_K-A,geojsons/ANT_1000_East_K-A.geojson,852.422195,ATL03_20181101024758_05110110_006,2,6,5110110,5110110geojsons/ANT_1000_East_K-A.geojson,True
35939,ATL03_20181101025539_05110111_006_02.h5,geojsons/simplified_ANT_1000_West_Ep-F.geojson,AIS_2018_1000_West_Ep-F,geojsons/ANT_1000_West_Ep-F.geojson,2514.029597,ATL03_20181101025539_05110111_006,2,6,5110111,5110111geojsons/ANT_1000_West_Ep-F.geojson,True
35940,ATL03_20181101030122_05110112_006_02.h5,geojsons/simplified_ANT_1000_West_Ep-F.geojson,AIS_2018_1000_West_Ep-F,geojsons/ANT_1000_West_Ep-F.geojson,1739.434034,ATL03_20181101030122_05110112_006,2,6,5110112,5110112geojsons/ANT_1000_West_Ep-F.geojson,True
34989,ATL03_20181101042215_05120110_006_02.h5,geojsons/simplified_ANT_1000_East_Jpp-K.geojson,AIS_2018_1000_East_Jpp-K,geojsons/ANT_1000_East_Jpp-K.geojson,568.969159,ATL03_20181101042215_05120110_006,2,6,5120110,5120110geojsons/ANT_1000_East_Jpp-K.geojson,True
...,...,...,...,...,...,...,...,...,...,...,...
5239,ATL03_20230621213434_00262004_006_01.h5,geojsons/simplified_GRE_2000_NO.geojson,GrIS_2023_2000_NO,geojsons/GRE_2000_NO.geojson,3110.911763,ATL03_20230621213434_00262004_006,1,6,262004,262004geojsons/GRE_2000_NO.geojson,True
6683,ATL03_20230621213943_00262005_006_01.h5,geojsons/simplified_GRE_2000_NW.geojson,GrIS_2023_2000_NW,geojsons/GRE_2000_NW.geojson,4143.117353,ATL03_20230621213943_00262005_006,1,6,262005,262005geojsons/GRE_2000_NW.geojson,True
6999,ATL03_20230621213943_00262005_006_01.h5,geojsons/simplified_GRE_2000_CW.geojson,GrIS_2023_2000_CW,geojsons/GRE_2000_CW.geojson,4143.117353,ATL03_20230621213943_00262005_006,1,6,262005,262005geojsons/GRE_2000_CW.geojson,True
6684,ATL03_20230621231400_00272005_006_01.h5,geojsons/simplified_GRE_2000_NW.geojson,GrIS_2023_2000_NW,geojsons/GRE_2000_NW.geojson,4928.093561,ATL03_20230621231400_00272005_006,1,6,272005,272005geojsons/GRE_2000_NW.geojson,True


In [None]:
# characters 6-13 of each granule name; for names like 'ATL03_20230501225732_...'
# this is the YYYYMMDD part of the timestamp
df_already_run.apply(lambda x: x.granule[6:14], axis=1)

In [None]:
# Diagnostics on a copy of df_all: how many distinct granules remain once the
# release suffix is ignored, and how many rows each release contributes.
dfnewall = df_all.copy()
dfnewall['granule_norelease'] = [g[:g.rfind('_')] for g in dfnewall['granule']]
dfnewall['release'] = [int(g.split('_')[-1][:2]) for g in dfnewall['granule']]
print(
    len(np.unique(dfnewall.granule_norelease)),
    len(np.unique(dfnewall.granule)),
    len(dfnewall),
)

rows_per_release = dfnewall.groupby(by='release')['granule'].count()
print(rows_per_release)
dfnewall

In [None]:
# count the rows flagged True in df's 'select' column
# NOTE(review): `df` is whatever an earlier cell last bound to that name - confirm it has a 'select' column
np.sum(df.select)

# Greenland 2019 - June 2023

In [None]:
# Output file for the combined list (a "_size" variant with size_mb is written as well).
filename_out = 'granule_lists/GRE_2000_May2019_Jun2023.csv'

# Search window (MM-DD, applied to every year from startyear through endyear).
startyear, endyear = 2019, 2023
startday, endday = '05-15', '09-15'
icesheet = 'GrIS'

# Region outlines: every regular file in gjsn_dir whose name contains searchfor.
searchfor = 'simplified_GRE_2000'
gjsn_dir = 'geojsons'
gjsn_list = [
    gjsn_dir + '/' + f
    for f in os.listdir(gjsn_dir)
    if os.path.isfile(os.path.join(gjsn_dir, f)) and (searchfor in f)
]
print('____________________________________________________________________________')
print('GREENLAND')
print('____________________________________________________________________________')

# One query per region and year; every result table is collected in dflist.
dflist = []
for i, gjsn in enumerate(gjsn_list):
    geojson = gjsn.rsplit('/', 1)[-1]
    print('----------------------------------------------------------------')
    print(geojson, '(', i+1, '/', len(gjsn_list), ')')
    for yr in np.arange(startyear, endyear+1):
        start_date = '%s-%s' % (yr, startday)
        end_date = '%s-%s' % (yr, endday)
        # label is the year itself, or "YYYY-YY" when the window spans two years
        if start_date[:4] == end_date[:4]:
            meltseason = start_date[:4]
        else:
            meltseason = start_date[:4] + '-' + end_date[2:4]
        outname = 'granule_lists/' + '_'.join([icesheet, meltseason, geojson.replace('.geojson','')]) + '.csv'
        df = make_granule_list(geojson, start_date, end_date, icesheet, meltseason, outname)
        dflist.append(df)

# Stack all results, strip the file-name prefixes from the description labels,
# and write the full table (including size_mb).
df_all = pd.concat(dflist)
df_all.loc[:, 'description'] = [
    d.replace('simplified_','').replace('GRE_','').replace('ANT_','')
    for d in df_all['description']
]
df_all = df_all.reset_index(drop=True)
df_all.to_csv(filename_out.replace('.csv', '_size.csv'), header=False, index=False)

# Summary statistics.
print('Number of ganules over Greenland:', df_all['description'].map(lambda d: 'GrIS' in d).sum())
print('Number of ganules over Antarctica:', df_all['description'].map(lambda d: 'AIS' in d).sum())
print('Total number of granules:', len(df_all))
maxrow = df_all.loc[np.argmax(df_all['size_mb']), :]
print('Largest granule: %.1f GB, %s, %s' % (maxrow.size_mb/1000, maxrow.granule, maxrow.geojson))
print('Total size: %.2f TB' % (df_all['size_mb'].sum()/1e6))

# Same table without the size column.
df_all_nosize = df_all.drop(columns='size_mb').copy()
df_all_nosize.to_csv(filename_out, header=False, index=False)
df_all_nosize

# Antarctica 2018/19 - 2020/21

In [None]:
# Output file for the combined list (a "_size" variant with size_mb is written as well).
filename_out = 'granule_lists/ANT_1000_Dec2018_Mar2021.csv'

# Each season runs from startday of one year to endday of the following year;
# seasons start in the years startyear .. endyear-1.
startyear, endyear = 2018, 2021
startday, endday = '12-01', '03-01'
icesheet = 'AIS'

# Region outlines: every regular file in gjsn_dir whose name contains searchfor.
searchfor = 'simplified_ANT_1000'
gjsn_dir = 'geojsons'
gjsn_list = [
    gjsn_dir + '/' + f
    for f in os.listdir(gjsn_dir)
    if os.path.isfile(os.path.join(gjsn_dir, f)) and (searchfor in f)
]

print('____________________________________________________________________________')
print('ANTARCTICA 2018/19 - 2020/21')
print('____________________________________________________________________________')

# One query per region and season; every result table is collected in dflist.
dflist = []
for i, gjsn in enumerate(gjsn_list):
    geojson = gjsn.rsplit('/', 1)[-1]
    print('----------------------------------------------------------------')
    print(geojson, '(', i+1, '/', len(gjsn_list), ')')
    for yr in np.arange(startyear, endyear):
        start_date = '%s-%s' % (yr, startday)
        end_date = '%s-%s' % (yr+1, endday)
        # label is the year itself, or "YYYY-YY" when the window spans two years
        if start_date[:4] == end_date[:4]:
            meltseason = start_date[:4]
        else:
            meltseason = start_date[:4] + '-' + end_date[2:4]
        outname = 'granule_lists/' + '_'.join([icesheet, meltseason, geojson.replace('.geojson','')]) + '.csv'
        df = make_granule_list(geojson, start_date, end_date, icesheet, meltseason, outname)
        dflist.append(df)

# Stack all results, strip the file-name prefixes from the description labels,
# and write the full table (including size_mb).
df_all = pd.concat(dflist)
df_all.loc[:, 'description'] = [
    d.replace('simplified_','').replace('GRE_','').replace('ANT_','')
    for d in df_all['description']
]
df_all = df_all.reset_index(drop=True)
df_all.to_csv(filename_out.replace('.csv', '_size.csv'), header=False, index=False)

# Summary statistics.
print('Number of ganules over Greenland:', df_all['description'].map(lambda d: 'GrIS' in d).sum())
print('Number of ganules over Antarctica:', df_all['description'].map(lambda d: 'AIS' in d).sum())
print('Total number of granules:', len(df_all))
maxrow = df_all.loc[np.argmax(df_all['size_mb']), :]
print('Largest granule: %.1f GB, %s, %s' % (maxrow.size_mb/1000, maxrow.granule, maxrow.geojson))
print('Total size: %.2f TB' % (df_all['size_mb'].sum()/1e6))

# Same table without the size column.
df_all_nosize = df_all.drop(columns='size_mb').copy()
df_all_nosize.to_csv(filename_out, header=False, index=False)
df_all_nosize

# Antarctica 2021/22 - 2022/23

In [None]:
# Output file for the combined list (a "_size" variant with size_mb is written as well).
filename_out = 'granule_lists/ANT_1000_Dec2021_Mar2023.csv'

# Each season runs from startday of one year to endday of the following year;
# seasons start in the years startyear .. endyear-1.
startyear, endyear = 2021, 2023
startday, endday = '12-01', '03-01'
icesheet = 'AIS'

# Region outlines: every regular file in gjsn_dir whose name contains searchfor.
searchfor = 'simplified_ANT_1000'
gjsn_dir = 'geojsons'
gjsn_list = [
    gjsn_dir + '/' + f
    for f in os.listdir(gjsn_dir)
    if os.path.isfile(os.path.join(gjsn_dir, f)) and (searchfor in f)
]

print('____________________________________________________________________________')
print('ANTARCTICA 2021/22 - 2022/23')
print('____________________________________________________________________________')

# One query per region and season; every result table is collected in dflist.
dflist = []
for i, gjsn in enumerate(gjsn_list):
    geojson = gjsn.rsplit('/', 1)[-1]
    print('----------------------------------------------------------------')
    print(geojson, '(', i+1, '/', len(gjsn_list), ')')
    for yr in np.arange(startyear, endyear):
        start_date = '%s-%s' % (yr, startday)
        end_date = '%s-%s' % (yr+1, endday)
        # label is the year itself, or "YYYY-YY" when the window spans two years
        if start_date[:4] == end_date[:4]:
            meltseason = start_date[:4]
        else:
            meltseason = start_date[:4] + '-' + end_date[2:4]
        outname = 'granule_lists/' + '_'.join([icesheet, meltseason, geojson.replace('.geojson','')]) + '.csv'
        df = make_granule_list(geojson, start_date, end_date, icesheet, meltseason, outname)
        dflist.append(df)

# Stack all results, strip the file-name prefixes from the description labels,
# and write the full table (including size_mb).
df_all = pd.concat(dflist)
df_all.loc[:, 'description'] = [
    d.replace('simplified_','').replace('GRE_','').replace('ANT_','')
    for d in df_all['description']
]
df_all = df_all.reset_index(drop=True)
df_all.to_csv(filename_out.replace('.csv', '_size.csv'), header=False, index=False)

# Summary statistics.
print('Number of ganules over Greenland:', df_all['description'].map(lambda d: 'GrIS' in d).sum())
print('Number of ganules over Antarctica:', df_all['description'].map(lambda d: 'AIS' in d).sum())
print('Total number of granules:', len(df_all))
maxrow = df_all.loc[np.argmax(df_all['size_mb']), :]
print('Largest granule: %.1f GB, %s, %s' % (maxrow.size_mb/1000, maxrow.granule, maxrow.geojson))
print('Total size: %.2f TB' % (df_all['size_mb'].sum()/1e6))

# Same table without the size column.
df_all_nosize = df_all.drop(columns='size_mb').copy()
df_all_nosize.to_csv(filename_out, header=False, index=False)
df_all_nosize

# Greenland extra (shoulder season: May 1-14 and Sep 16-30)

In [None]:
# Build the Greenland granule list for two extra date windows per year
# (May 1-14 and Sep 16-30) and write it with and without the size_mb column.
filename_out = 'granule_lists/GRE_2000_extraMaySep.csv'
dflist = []

startyear = 2019
endyear = 2023
icesheet = 'GrIS'
searchfor = 'simplified_GRE_2000'
gjsn_dir = 'geojsons'
# region outlines: every regular file in gjsn_dir whose name contains searchfor
gjsn_list = [gjsn_dir+'/'+f for f in os.listdir(gjsn_dir) \
            if os.path.isfile(os.path.join(gjsn_dir, f)) & (searchfor in f)]
print('____________________________________________________________________________')
print('GREENLAND EXTRA')
print('____________________________________________________________________________')

# first pass: early-May window, every region, years startyear..endyear inclusive
startday = '05-01'
endday = '05-14'
for i, gjsn in enumerate(gjsn_list):
    # file name without the directory part
    geojson = gjsn[gjsn.rfind('/')+1:]
    print('----------------------------------------------------------------')
    print(geojson, '(', i+1, '/', len(gjsn_list), ')')
    for yr in np.arange(startyear, endyear+1):
        start_date = '%s-%s' % (yr, startday)
        end_date = '%s-%s' % (yr, endday)
        # both dates share the year here, so meltseason is just that year
        meltseason = start_date[:4] if start_date[:4]==end_date[:4] else start_date[:4] + '-' + end_date[2:4]
        outname = 'granule_lists/' + icesheet + '_' + meltseason + '_' + geojson.replace('.geojson','') + '.csv'
        df = make_granule_list(geojson, start_date, end_date, icesheet, meltseason, outname)
        dflist.append(df)

# second pass: late-September window, same regions and years
# NOTE(review): outname is identical to the one used in the first pass for the same
# year/region; if make_granule_list writes a file to outname, this pass replaces
# the May file - TODO confirm. The combined table below is unaffected (it uses dflist).
startday = '09-16'
endday = '09-30'
for i, gjsn in enumerate(gjsn_list):
    geojson = gjsn[gjsn.rfind('/')+1:]
    print('----------------------------------------------------------------')
    print(geojson, '(', i+1, '/', len(gjsn_list), ')')
    for yr in np.arange(startyear, endyear+1):
        start_date = '%s-%s' % (yr, startday)
        end_date = '%s-%s' % (yr, endday)
        meltseason = start_date[:4] if start_date[:4]==end_date[:4] else start_date[:4] + '-' + end_date[2:4]
        outname = 'granule_lists/' + icesheet + '_' + meltseason + '_' + geojson.replace('.geojson','') + '.csv'
        df = make_granule_list(geojson, start_date, end_date, icesheet, meltseason, outname)
        dflist.append(df)

# stack all results (May rows first, then September rows) and strip the
# file-name prefixes from the description labels
df_all = pd.concat(dflist)
df_all.loc[:, 'description'] = df_all.apply(lambda x: x.loc['description'].replace('simplified_','').replace('GRE_','').replace('ANT_',''), axis=1)
df_all.reset_index(inplace=True, drop=True)
# full table including size_mb
df_all.to_csv(filename_out.replace('.csv', '_size.csv'), header=False, index=False)

print('Number of ganules over Greenland:', np.sum(df_all.apply(lambda x: 'GrIS' in x.loc['description'], axis=1)))
print('Number of ganules over Antarctica:', np.sum(df_all.apply(lambda x: 'AIS' in x.loc['description'], axis=1)))
print('Total number of granules:', len(df_all))
# np.argmax gives a position; it doubles as a label only because the index was reset above
maxrow = df_all.loc[np.argmax(df_all.size_mb),:]
print('Largest granule: %.1f GB, %s, %s' % (maxrow.size_mb/1000, maxrow.granule, maxrow.geojson))
print('Total size: %.2f TB' % (np.sum(df_all.size_mb)/1e6))

# same table without the size column
df_all_nosize = df_all.drop(columns='size_mb').copy()
df_all_nosize.to_csv(filename_out, header=False, index=False)
df_all_nosize

# Antarctica extra (shoulder season: November and March)

In [None]:
# Build the Antarctic granule list for two extra date windows per season
# (November before, and March after) and write it with and without the size_mb column.
dflist = []
filename_out = 'granule_lists/ANT_1000_extraNovMar.csv'

startyear = 2018
endyear = 2023
icesheet = 'AIS'
searchfor = 'simplified_ANT_1000'
gjsn_dir = 'geojsons'
# region outlines: every regular file in gjsn_dir whose name contains searchfor
gjsn_list = [gjsn_dir+'/'+f for f in os.listdir(gjsn_dir) \
            if os.path.isfile(os.path.join(gjsn_dir, f)) & (searchfor in f)]

print('____________________________________________________________________________')
print('ANTARCTICA EXTRA')
print('____________________________________________________________________________')

# first pass: November of the years startyear .. endyear-1
startday = '11-01'
endday = '11-30'
for i, gjsn in enumerate(gjsn_list):
    # file name without the directory part
    geojson = gjsn[gjsn.rfind('/')+1:]
    print('----------------------------------------------------------------')
    print(geojson, '(', i+1, '/', len(gjsn_list), ')')
    for yr in np.arange(startyear, endyear):
        start_date = '%s-%s' % (yr, startday)
        end_date = '%s-%s' % (yr, endday)
        # both dates share the year here, so meltseason is the plain year (not "YYYY-YY")
        meltseason = start_date[:4] if start_date[:4]==end_date[:4] else start_date[:4] + '-' + end_date[2:4]
        outname = 'granule_lists/' + icesheet + '_' + meltseason + '_' + geojson.replace('.geojson','') + '.csv'
        df = make_granule_list(geojson, start_date, end_date, icesheet, meltseason, outname)
        dflist.append(df)

# second pass: March of the following years (startyear+1 .. endyear)
# NOTE(review): meltseason is the plain calendar year in both passes, so e.g.
# November 2019 and March 2019 get the same label and the same outname although they
# belong to different austral summers; if make_granule_list writes a file to outname,
# the March pass replaces the November file - TODO confirm this is intended.
startday = '03-01'
endday = '03-31'
for i, gjsn in enumerate(gjsn_list):
    geojson = gjsn[gjsn.rfind('/')+1:]
    print('----------------------------------------------------------------')
    print(geojson, '(', i+1, '/', len(gjsn_list), ')')
    for yr in np.arange(startyear, endyear):
        start_date = '%s-%s' % (yr+1, startday)
        end_date = '%s-%s' % (yr+1, endday)
        meltseason = start_date[:4] if start_date[:4]==end_date[:4] else start_date[:4] + '-' + end_date[2:4]
        outname = 'granule_lists/' + icesheet + '_' + meltseason + '_' + geojson.replace('.geojson','') + '.csv'
        df = make_granule_list(geojson, start_date, end_date, icesheet, meltseason, outname)
        dflist.append(df)

# stack all results (November rows first, then March rows) and strip the
# file-name prefixes from the description labels
df_all = pd.concat(dflist)
df_all.loc[:, 'description'] = df_all.apply(lambda x: x.loc['description'].replace('simplified_','').replace('GRE_','').replace('ANT_',''), axis=1)
df_all.reset_index(inplace=True, drop=True)
# full table including size_mb
df_all.to_csv(filename_out.replace('.csv', '_size.csv'), header=False, index=False)

print('Number of ganules over Greenland:', np.sum(df_all.apply(lambda x: 'GrIS' in x.loc['description'], axis=1)))
print('Number of ganules over Antarctica:', np.sum(df_all.apply(lambda x: 'AIS' in x.loc['description'], axis=1)))
print('Total number of granules:', len(df_all))
# np.argmax gives a position; it doubles as a label only because the index was reset above
maxrow = df_all.loc[np.argmax(df_all.size_mb),:]
print('Largest granule: %.1f GB, %s, %s' % (maxrow.size_mb/1000, maxrow.granule, maxrow.geojson))
print('Total size: %.2f TB' % (np.sum(df_all.size_mb)/1e6))

# same table without the size column
df_all_nosize = df_all.drop(columns='size_mb').copy()
df_all_nosize.to_csv(filename_out, header=False, index=False)
df_all_nosize

# combine both ANT and GRE extra job inputs for shoulder season

In [None]:
# Concatenate the Greenland and Antarctica shoulder-season lists, once for the
# job-input files (no size column) and once for the files that include sizes.
df1, df2 = (
    pd.read_csv(fn, header=None)
    for fn in ('granule_lists/GRE_2000_extraMaySep.csv',
               'granule_lists/ANT_1000_extraNovMar.csv')
)
dfb = pd.concat([df1, df2])
dfb.to_csv('granule_lists/extra_shoulderseason_GRE_2000_ANT_1000.csv', header=False, index=False)

# the size files have no header row, so column names are supplied here
nms = ['granule', 'geojson', 'description', 'geojson_clip', 'size_mb']
df1 = pd.read_csv('granule_lists/GRE_2000_extraMaySep_size.csv', names=nms, header=None)
df2 = pd.read_csv('granule_lists/ANT_1000_extraNovMar_size.csv', names=nms, header=None)
dfb = pd.concat([df1, df2], ignore_index=True)
dfb.to_csv('granule_lists/extra_shoulderseason_GRE_2000_ANT_1000_size.csv', header=False, index=False)

# summary statistics for the combined list
n_gris = dfb['description'].map(lambda d: 'GrIS' in d).sum()
n_ais = dfb['description'].map(lambda d: 'AIS' in d).sum()
print('Number of ganules over Greenland:', n_gris)
print('Number of ganules over Antarctica:', n_ais)
print('Total number of granules:', len(dfb))
# the index is 0..n-1, so the positional argmax is also a valid label
imax = np.argmax(dfb.size_mb)
maxrow = dfb.loc[imax, :]
print('Largest granule: %.1f GB, %s, %s' % (maxrow.size_mb/1000, maxrow.granule, maxrow.geojson))
print('Total size: %.2f TB' % (dfb.size_mb.sum()/1e6))

In [None]:
# scratch arithmetic; presumably adds two granule counts printed by earlier cells -- TODO confirm which lists
17305+1762

# stats for all combined inputs

In [None]:
# Summary statistics over every job-input list, using the "_size" companion
# file of each list (same name with '_size' inserted before '.csv').
in_list = [
    'granule_lists/GRE_2000_May2019_Jun2023.csv',
    'granule_lists/ANT_1000_Dec2018_Mar2021.csv',
    'granule_lists/ANT_1000_Dec2021_Mar2023.csv',
    'granule_lists/extra_shoulderseason_GRE_2000_ANT_1000.csv']
nms = ['granule', 'geojson', 'description', 'geojson_clip', 'size_mb']
dfs_all_input = [
    pd.read_csv(grlist.replace('.csv', '_size.csv'), header=None, names=nms)
    for grlist in in_list
]
df_all_inputs = pd.concat(dfs_all_input, ignore_index=True)

n_gris = df_all_inputs['description'].map(lambda d: 'GrIS' in d).sum()
n_ais = df_all_inputs['description'].map(lambda d: 'AIS' in d).sum()
print('Number of ganules over Greenland:', n_gris)
print('Number of ganules over Antarctica:', n_ais)
print('Total number of granules:', len(df_all_inputs))
# the index is 0..n-1, so the positional argmax is also a valid label
imax = np.argmax(df_all_inputs.size_mb)
maxrow = df_all_inputs.loc[imax, :]
print('Largest granule: %.1f GB, %s, %s' % (maxrow.size_mb/1000, maxrow.granule, maxrow.geojson))
print('Total size: %.2f TB' % (df_all_inputs.size_mb.sum()/1e6))


In [None]:
# fraction of rows in df_all_inputs whose size_mb value exceeds 7000
np.mean(df_all_inputs.size_mb > 7000)

In [None]:
# Write a smaller test list drawn from an existing granule list.
# Candidate input lists: only the last assignment takes effect.
fn_in = 'granule_lists/GRE_2500_ANT_1500_Oct2018_Mar2023.csv'
fn_in = 'granule_lists/GRE_2000_ANT_1000_Oct2018_Mar2023.csv'
fn_in = 'granule_lists/GRE_2000_ANT_1000_Oct2018_Jun2023.csv'
n_granules = 1000

df = pd.read_csv(fn_in, header=None)
if n_granules != 1:
    # random sample of rows, drawn without replacement
    idxs = np.random.choice(np.arange(len(df)), size=n_granules, replace=False)
    df_small = df.loc[idxs]
else:
    # single-granule case: pick the rows for one fixed granule ID (column 0)
    is_test_granule = df[0].map(lambda g: 'ATL03_20220714010847' in g)
    df_small = df[is_test_granule]

# output name: input name with '-<n_granules>' inserted before '.csv'
fn_out = fn_in.replace('.csv', '-%i.csv' % n_granules)
print(fn_out)

df_small.to_csv(fn_out, header=False, index=False)
df_small

In [None]:
# Print the length of the comma-separated coordinate string of each region
# polygon, to find the longest one and check that the query code copes with it.
gjsn_dir = 'geojsons'
gjsn_list = []
for searchfor in ('simplified_GRE_2500', 'simplified_ANT_1500'):
    gjsn_list += [
        gjsn_dir + '/' + fname
        for fname in os.listdir(gjsn_dir)
        if (searchfor in fname) and os.path.isfile(os.path.join(gjsn_dir, fname))
    ]

for geojson_filepath in gjsn_list:
    gdf = gpd.read_file(geojson_filepath)
    # first geometry in the file, re-oriented with sign=1.0
    poly = orient(gdf.loc[0].geometry, sign=1.0)
    vertices = zip(*poly.exterior.coords.xy)
    # flatten the (x, y) pairs into "x0,y0,x1,y1,..."
    polygon = ','.join(str(c) for xy in vertices for c in xy)
    print('%4i'%len(polygon), geojson_filepath)

In [None]:
# Shell command, kept here for reference. It is not Python, so executing it as
# a notebook code cell raises a SyntaxError. Run it from a terminal, or prefix
# it with "!" to run it from the notebook.
# python3 detect_lakes.py --granule ATL03_20200302160852_10220610_006_01.h5 --polygon geojsons/simplified_ANT_1500_East_Dp-E.geojson

In [None]:
# Scratch cell: set up inputs for np.interp with an empty set of support points.
xp = np.linspace(2,8)
# overrides the line above: the support points are now an empty array
xp = np.array([])
fp = np.sin(xp)
x = np.linspace(0,10)
# number of support points (0 here, because xp was replaced by an empty array)
len(xp)
#np.interp(x, xp, fp, left=np.nan, right=np.nan)

In [None]:
# Granule lists for the Greenland regions whose geojson name contains
# 'simplified_GRE_2500_CW': one query per region and year (1 May - 30 Sep),
# combined into a single CSV.
dflist = []

startyear = 2019
endyear = 2022
startday = '05-01'
endday = '09-30'
icesheet = 'GrIS'

searchfor = 'simplified_GRE_2500_CW'
gjsn_dir = 'geojsons'
gjsn_list = [gjsn_dir+'/'+f for f in os.listdir(gjsn_dir)
             if os.path.isfile(os.path.join(gjsn_dir, f)) and (searchfor in f)]

for gjsn in gjsn_list:
    # file name without the directory part
    geojson = gjsn[gjsn.rfind('/')+1:]

    # gdf = gpd.read_file(gjsn)
    # print(gdf.geometry.loc[0].geom_type, geojson)

    for yr in np.arange(startyear, endyear+1):
        start_date = '%s-%s' % (yr, startday)
        end_date = '%s-%s' % (yr, endday)
        # single-year label unless the date range crosses a year boundary
        meltseason = start_date[:4] if start_date[:4]==end_date[:4] else start_date[:4] + '-' + end_date[2:4]
        outname = 'granule_lists/' + icesheet + '_' + meltseason + '_' + geojson.replace('.geojson','') + '.csv'
        make_granule_list(geojson, start_date, end_date, icesheet, meltseason, outname)
        # read the list back without a header row: columns are labelled 0, 1, 2, ...
        dflist.append(pd.read_csv(outname,header=None))

df_all = pd.concat(dflist)
# column 3: the geojson path from column 1 with 'simplified_' removed
df_all.loc[:, 3] = df_all.apply(lambda x: x.loc[1].replace('simplified_', ''), axis=1)
# column 2: description with the geojson name prefixes removed
df_all.loc[:, 2] = df_all.apply(lambda x: x.loc[2].replace('simplified_','').replace('GRE_','').replace('ANT_',''), axis=1)
df_all.to_csv('granule_lists/GRE_2500_CW_2019-22_.csv', header=False, index=False)
# NOTE: this cell used to end by dropping a 'size_mb' column and writing the
# result to 'granule_lists/GRE_2000_ANT_1000_Oct2018_Jun2023.csv'. The frame
# has integer column labels (header=None), so the drop raised a KeyError, and
# the target was the combined all-region list rather than a CW list. Those
# lines were removed.
df_all

In [None]:
# Read the geojson whose path is the simplified polygon's path with
# 'simplified_' removed.
import geopandas as gpd
polygon = 'geojsons/simplified_GRE_2500_CW.geojson'
poly_nonsimplified = polygon.replace('simplified_', '')
# bare expression; it is not the last expression of the cell, so nothing is displayed for it
poly_nonsimplified
clip_shape = gpd.read_file(poly_nonsimplified)
clip_shape

In [None]:
# Build the coordinate string for CMR polygon filtering from one region file.
gdf = gpd.read_file('geojsons/simplified_GRE_2500_NO.geojson')
# first geometry in the file, re-oriented with sign=1.0
poly = orient(gdf.loc[0].geometry, sign=1.0)

# flatten the exterior ring's (x, y) pairs into "x0,y0,x1,y1,..."
vertices = zip(*poly.exterior.coords.xy)
polygon = ','.join(str(c) for xy in vertices for c in xy)
polygon

# Greenland and Antarctica, all regions, Oct 2018 - March 2022

For GRE2500/ANT1500:
- Number of granules over Greenland: 9325
- Number of granules over Antarctica: 43790
- Total number of granules: 53115
- Largest granule: 12.2 GB, ATL03_20220511191525_07591505_006_01.h5, geojsons/simplified_GRE_2500_NW.geojson
- Total size: 141.03 TB

For GRE2000/ANT1000:
- Number of granules over Greenland: 8068
- Number of granules over Antarctica: 39947
- Total number of granules: 48015
- Largest granule: 12.2 GB, ATL03_20220511191525_07591505_006_01.h5, geojsons/simplified_GRE_2000_NO.geojson
- Total size: 128.30 TB

In [None]:
# Combined granule list: all simplified_GRE_2000 regions (1 May - 30 Sep of each
# year) followed by all simplified_ANT_1000 regions (1 Nov - 15 Mar of the
# following year).
dflist = []

# ---- Greenland ----
startyear = 2019
endyear = 2023
startday = '05-01'
endday = '09-30'
icesheet = 'GrIS'

searchfor = 'simplified_GRE_2000'
gjsn_dir = 'geojsons'
# regular files in gjsn_dir whose name contains the search string
gjsn_list = [
    gjsn_dir + '/' + fname
    for fname in os.listdir(gjsn_dir)
    if (searchfor in fname) and os.path.isfile(os.path.join(gjsn_dir, fname))
]
print('____________________________________________________________________________')
print('GREENLAND')
print('____________________________________________________________________________')

for i, gjsn in enumerate(gjsn_list):
    geojson = gjsn.rsplit('/', 1)[-1]
    print('----------------------------------------------------------------')
    print(geojson, '(', i+1, '/', len(gjsn_list), ')')
    # endyear is included for Greenland
    for yr in np.arange(startyear, endyear+1):
        start_date = '%s-%s' % (yr, startday)
        end_date = '%s-%s' % (yr, endday)
        if start_date[:4] == end_date[:4]:
            meltseason = start_date[:4]
        else:
            meltseason = start_date[:4] + '-' + end_date[2:4]
        outname = 'granule_lists/' + icesheet + '_' + meltseason + '_' + geojson.replace('.geojson','') + '.csv'
        df = make_granule_list(geojson, start_date, end_date, icesheet, meltseason, outname)
        dflist.append(df)

# ---- Antarctica ----
startyear = 2018
endyear = 2023
startday = '11-01'
endday = '03-15'
icesheet = 'AIS'

searchfor = 'simplified_ANT_1000'
gjsn_dir = 'geojsons'
gjsn_list = [
    gjsn_dir + '/' + fname
    for fname in os.listdir(gjsn_dir)
    if (searchfor in fname) and os.path.isfile(os.path.join(gjsn_dir, fname))
]

print('____________________________________________________________________________')
print('ANTARCTICA')
print('____________________________________________________________________________')
for i, gjsn in enumerate(gjsn_list):
    geojson = gjsn.rsplit('/', 1)[-1]
    print('----------------------------------------------------------------')
    print(geojson, '(', i+1, '/', len(gjsn_list), ')')
    # each season starts in year yr and ends in yr+1, so endyear itself is excluded
    for yr in np.arange(startyear, endyear):
        start_date = '%s-%s' % (yr, startday)
        end_date = '%s-%s' % (yr+1, endday)
        if start_date[:4] == end_date[:4]:
            meltseason = start_date[:4]
        else:
            meltseason = start_date[:4] + '-' + end_date[2:4]
        outname = 'granule_lists/' + icesheet + '_' + meltseason + '_' + geojson.replace('.geojson','') + '.csv'
        df = make_granule_list(geojson, start_date, end_date, icesheet, meltseason, outname)
        dflist.append(df)

df_all = pd.concat(dflist)
# shorten the descriptions by removing the geojson name prefixes
df_all.loc[:, 'description'] = df_all['description'].map(
    lambda d: d.replace('simplified_','').replace('GRE_','').replace('ANT_',''))
df_all = df_all.reset_index(drop=True)
df_all.to_csv('granule_lists/GRE_2000_ANT_1000_Oct2018_Jun2023_size.csv', header=False, index=False)
n_gris = df_all['description'].map(lambda d: 'GrIS' in d).sum()
n_ais = df_all['description'].map(lambda d: 'AIS' in d).sum()
print('Number of ganules over Greenland:', n_gris)
print('Number of ganules over Antarctica:', n_ais)
print('Total number of granules:', len(df_all))
# the index was reset above, so the positional argmax is also a valid label
imax = np.argmax(df_all.size_mb)
maxrow = df_all.loc[imax, :]
print('Largest granule: %.1f GB, %s, %s' % (maxrow.size_mb/1000, maxrow.granule, maxrow.geojson))
print('Total size: %.2f TB' % (df_all.size_mb.sum()/1e6))

In [None]:
# copy of df_all without the size_mb column, written without header or index
df_all_nosize = df_all.drop(columns='size_mb').copy()
df_all_nosize.to_csv('granule_lists/GRE_2000_ANT_1000_Oct2018_Jun2023.csv', header=False, index=False)
df_all_nosize

In [None]:
# remove the geojson name prefixes from the description column, then write all columns of df_all
# NOTE(review): the target is the GRE_2500/ANT_1500 file; confirm df_all holds that data when this cell is run
df_all.loc[:, 'description'] = df_all.apply(lambda x: x.loc['description'].replace('simplified_','').replace('GRE_','').replace('ANT_',''), axis=1)
df_all.to_csv('granule_lists/GRE_2500_ANT_1500_Oct2018_Mar2023_size.csv', header=False, index=False)
df_all

In [None]:
# write all columns of df_all, without header row or index column
df_all.to_csv('granule_lists/GRE_2500_ANT_1500_Oct2018_Mar2023_size.csv', header=False, index=False)

In [None]:
# display the current df_all
df_all

In [None]:
# position of the largest size_mb value in df_all
np.argmax(df_all.size_mb)

In [None]:
# row with the largest size_mb; a positional argmax is passed to the label-based .loc,
# so this assumes df_all has a default 0..n-1 index -- TODO confirm the index was reset
df_all.loc[np.argmax(df_all.size_mb),:]

In [None]:
# display the current df_all
df_all

In [None]:
# Single-day test query (2022-07-14) for the Jakobshavn test polygon.
geojson = 'jakobshavn_test.geojson'
icesheet = 'GrIS'
startyear, endyear = 2022, 2022
startday, endday = '07-14', '07-14'
start_date = '%s-%s' % (startyear, startday)
end_date = '%s-%s' % (endyear, endday)

# single-year label unless the date range crosses a year boundary
if start_date[:4] == end_date[:4]:
    meltseason = start_date[:4]
else:
    meltseason = start_date[:4] + '-' + end_date[2:4]
outname = 'zzz_test006.csv'

make_granule_list(geojson, start_date, end_date, icesheet, meltseason, outname)
# read the written list back to inspect it
pd.read_csv(outname, header=None)

In [None]:
# make sure that it worked by reading the file at `outname` (no header row) into a DataFrame and displaying it
import pandas as pd
pd.read_csv(outname,header=None)

# Jakobshavn + Amery + George VI all years

In [None]:
# set parameters and make one granule list per melt season for the
# Jakobshavn test polygon
geojson = 'jakobshavn_test.geojson'
icesheet = 'GrIS'
startyear = 2019
endyear = 2022
startday = '05-15'
endday = '09-15'

# start_date / end_date are built from the parameters above for every year,
# instead of reusing whatever values an earlier cell left behind
for yr in np.arange(startyear, endyear+1):
    start_date = '%s-%s' % (yr, startday)
    end_date = '%s-%s' % (yr, endday)

    # single-year label unless the date range crosses a year boundary
    meltseason = start_date[:4] if start_date[:4]==end_date[:4] else start_date[:4] + '-' + end_date[2:4]
    outname = 'granule_lists/' + icesheet + '_' + meltseason + '_' + geojson.replace('.geojson','') + '.csv'

    make_granule_list(geojson, start_date, end_date, icesheet, meltseason, outname)

In [None]:
# Granule lists for west Greenland (15 May - 15 Sep of each year) and for
# George VI and Amery (15 Nov - 15 Mar of the following year), combined into
# one CSV.
geojson = 'west_greenland.geojson'
icesheet = 'GrIS'
startyear, endyear = 2019, 2022
startday, endday = '05-15', '09-15'

dflist = []
for yr in np.arange(startyear, endyear+1):
    start_date = '%s-%s' % (yr, startday)
    end_date = '%s-%s' % (yr, endday)

    # single-year label unless the date range crosses a year boundary
    if start_date[:4] == end_date[:4]:
        meltseason = start_date[:4]
    else:
        meltseason = start_date[:4] + '-' + end_date[2:4]
    outname = 'granule_lists/' + icesheet + '_' + meltseason + '_' + geojson.replace('.geojson','') + '.csv'

    make_granule_list(geojson, start_date, end_date, icesheet, meltseason, outname)
    # the list is read back from the file that was just written
    dflist.append(pd.read_csv(outname, header=None))

geojson1 = 'george_vi.geojson'
geojson2 = 'amery.geojson'
icesheet = 'AIS'
startyear, endyear = 2018, 2021
startday, endday = '11-15', '03-15'

for yr in np.arange(startyear, endyear+1):
    start_date = '%s-%s' % (yr, startday)
    end_date = '%s-%s' % (yr+1, endday)

    if start_date[:4] == end_date[:4]:
        meltseason = start_date[:4]
    else:
        meltseason = start_date[:4] + '-' + end_date[2:4]
    list_prefix = 'granule_lists/' + icesheet + '_' + meltseason + '_'
    outname1 = list_prefix + geojson1.replace('.geojson','') + '.csv'
    outname2 = list_prefix + geojson2.replace('.geojson','') + '.csv'

    # both queries run first, then both files are read back
    for gj, out in ((geojson1, outname1), (geojson2, outname2)):
        make_granule_list(gj, start_date, end_date, icesheet, meltseason, out)
    for out in (outname1, outname2):
        dflist.append(pd.read_csv(out, header=None))

df_all = pd.concat(dflist)
df_all.to_csv('granule_lists/wais-areas.csv', header=False, index=False)

# granule list from failed

In [None]:
# read the failed-jobs file; it has no header row, so columns are labelled 0, 1, ... (column 0 is parsed below)
df = pd.read_csv('failed_jobs/jobs_failed.csv', header=None)
def get_geo(x):
    """Return the geojson path and job description encoded in a job string.

    The first of 'amery', 'west_greenland', 'george_vi' (checked in that
    order) that occurs in ``x`` selects the area. The result is the tuple
    ``('geojsons/<area>.geojson', description)``, where the description is
    the part of ``x`` from four characters after the position of 'job_'
    up to and including the area name. Returns None when no area name
    occurs in ``x``.
    """
    known_areas = ('amery', 'west_greenland', 'george_vi')
    area = next((name for name in known_areas if name in x), None)
    if area is None:
        return None
    desc_start = x.find('job_') + 4
    desc_end = x.find(area) + len(area)
    return 'geojsons/' + area + '.geojson', x[desc_start:desc_end]
# granule file name: the part of the job string from 'ATL03' through the '.h5' extension
df['granule'] = df[0].map(lambda x : x[x.find('ATL03') : (x.find('.h5')+3)])
# get_geo gives a (geojson path, description) pair per row; zip(*...) turns the pairs into two columns
df['geo'], df['desc'] = list(zip(*df[0].map(get_geo)))
# drop the raw job string (column 0) so only granule, geo and desc are written
df.drop([0],inplace=True,axis=1)
df.to_csv('granule_lists/wais-areas_failed1.csv', header=False, index=False)

# granule list for all of WAIS melt regions

In [None]:
import os

# Granule lists for every geojson whose name contains 'waismeltregions':
# one query per file and season (1 Nov - 15 Mar of the following year),
# combined into one CSV.
icesheet = 'AIS'
startyear, endyear = 2018, 2021
startday, endday = '11-01', '03-15'

searchfor = 'waismeltregions'
gjsn_dir = 'geojsons'
# regular files in gjsn_dir whose name contains the search string
gjsn_list = [
    gjsn_dir + '/' + fname
    for fname in os.listdir(gjsn_dir)
    if (searchfor in fname) and os.path.isfile(os.path.join(gjsn_dir, fname))
]

dflist = []
for gjsn in gjsn_list:
    # everything after the first '/'
    geojson = gjsn.split('/', 1)[-1]
    for yr in np.arange(startyear, endyear+1):
        start_date = '%s-%s' % (yr, startday)
        end_date = '%s-%s' % (yr+1, endday)
        # single-year label unless the date range crosses a year boundary
        if start_date[:4] == end_date[:4]:
            meltseason = start_date[:4]
        else:
            meltseason = start_date[:4] + '-' + end_date[2:4]
        outname = 'granule_lists/' + icesheet + '_' + meltseason + '_' + geojson.replace('.geojson','') + '.csv'
        make_granule_list(geojson, start_date, end_date, icesheet, meltseason, outname)
        # the list is read back from the file that was just written
        dflist.append(pd.read_csv(outname, header=None))

df_all = pd.concat(dflist)
df_all.to_csv('granule_lists/waismeltregions.csv', header=False, index=False)

In [None]:
# set every value of df's 'granule' column to the literal string 'granule'
# NOTE(review): looks like scratch work; purpose unclear -- confirm before relying on df afterwards
df['granule'] = 'granule'

In [None]:
# display the current df
df

In [None]:
# Granule lists for George VI and Amery, one query per area and season
# (15 Nov - 15 Mar of the following year), concatenated for display.
geojson1 = 'george_vi.geojson'
geojson2 = 'amery.geojson'
icesheet = 'AIS'
startyear, endyear = 2018, 2021
startday, endday = '11-15', '03-15'

dflist = []
for yr in np.arange(startyear, endyear+1):
    start_date = '%s-%s' % (yr, startday)
    end_date = '%s-%s' % (yr+1, endday)
    print(start_date, end_date)

    # single-year label unless the date range crosses a year boundary
    if start_date[:4] == end_date[:4]:
        meltseason = start_date[:4]
    else:
        meltseason = start_date[:4] + '-' + end_date[2:4]
    list_prefix = 'granule_lists/' + icesheet + '_' + meltseason + '_'
    outname1 = list_prefix + geojson1.replace('.geojson','') + '.csv'
    outname2 = list_prefix + geojson2.replace('.geojson','') + '.csv'

    # both queries run first, then both files are read back
    for gj, out in ((geojson1, outname1), (geojson2, outname2)):
        make_granule_list(gj, start_date, end_date, icesheet, meltseason, out)
    for out in (outname1, outname2):
        dflist.append(pd.read_csv(out, header=None))

df_all = pd.concat(dflist)
df_all