In [26]:
import pandas as pd
import numpy as np

In [29]:
def write_granule_list(input_list):
    """Turn a list of held jobs into granule lists that can be resubmitted.

    ``input_list`` is read as a header-less two-column CSV: the job file name
    and the hold reason. The granule name, the job description and the two
    geojson paths are parsed out of each job file name.

    Files written:
      * ``<input_list>`` with ``.csv`` replaced by ``_processed.csv``: every
        parsed column plus the raw inputs (with header and index).
      * ``memory_*``: jobs whose hold reason contains
        ``'exceeded request_memory'``.
      * ``others_*``: all remaining jobs.
      * ``resubmit_*``: the memory jobs followed by the other jobs.

    The three list files are header-less with the columns granule, geojson,
    description, geojson_full. Their names are derived from ``input_list`` by
    replacing ``hold_lists/`` with ``granule_lists/`` and ``final_`` with the
    respective prefix.

    Parameters
    ----------
    input_list : str
        Path of the hold list, e.g. ``'hold_lists/final_<run>.csv'``.

    Returns
    -------
    tuple of str
        ``(fn_mem, fn_nomem, fn_resub)``, the paths of the three list files.

    Raises
    ------
    ValueError
        If the derived output names are not distinct from each other and from
        ``input_list`` (e.g. the path contains no ``final_``). Writing would
        then silently overwrite one file with another.
    """
    # Derive and validate the output names before anything is written.
    list_base = input_list.replace('hold_lists/', 'granule_lists/')
    fn_mem = list_base.replace('final_', 'memory_')
    fn_nomem = list_base.replace('final_', 'others_')
    fn_resub = list_base.replace('final_', 'resubmit_')
    if len({input_list, fn_mem, fn_nomem, fn_resub}) < 4:
        raise ValueError(
            "cannot derive distinct output names from %r "
            "(expected a path containing 'final_')" % input_list
        )

    def is_memory_hold(reason):
        # An empty hold-reason field is parsed as NaN (a float); treat it as
        # "not a memory hold" instead of failing on the substring test.
        return isinstance(reason, str) and 'exceeded request_memory' in reason

    def get_granule(fn):
        return fn[fn.find('ATL03_'):fn.find('.h5') + 3]

    def get_description(fn):
        substr = fn[fn.find('job_') + 4:fn.find('_ATL03_')]
        # Drop everything from the last '-' on (the trailing id of the job).
        return substr[:substr.rfind('-')]

    def get_geojson(description):
        parms = description.split('_')
        parms[0] = 'ANT' if parms[0] == 'AIS' else 'GRE'
        del parms[1]  # the second description field is not part of the geojson name
        return 'geojsons/simplified_' + '_'.join(parms) + '.geojson'

    df = pd.read_csv(input_list, header=None, names=['fn', 'hold_reason'])
    df['is_memory'] = df['hold_reason'].map(is_memory_hold).astype(bool)
    df['granule'] = df['fn'].map(get_granule)
    df['description'] = df['fn'].map(get_description)
    df['geojson'] = df['description'].map(get_geojson)
    df['geojson_full'] = df['geojson'].map(lambda g: g.replace('simplified_', ''))
    df = df[['granule', 'geojson', 'description', 'geojson_full', 'fn', 'hold_reason', 'is_memory']]
    df.to_csv(input_list.replace('.csv', '_processed.csv'))

    list_columns = ['granule', 'geojson', 'description', 'geojson_full']
    df_mem = df.loc[df.is_memory, list_columns]
    df_nomem = df.loc[~df.is_memory, list_columns]
    df_resub = pd.concat((df_mem, df_nomem)).reset_index(drop=True)

    df_mem.to_csv(fn_mem, header=False, index=False)
    print('Wrote file %s. (%i jobs)' % (fn_mem, len(df_mem)))
    df_nomem.to_csv(fn_nomem, header=False, index=False)
    print('Wrote file %s. (%i jobs)' % (fn_nomem, len(df_nomem)))
    df_resub.to_csv(fn_resub, header=False, index=False)
    print('Wrote file %s. (%i jobs)' % (fn_resub, len(df_resub)))

    return fn_mem, fn_nomem, fn_resub

def write_submit_file(list_fn, sub_fn=None, mem_gb=16, stream_error=False, stream_output=False, disk_gb=None):
    """Write an HTCondor submit file that queues one job per row of a granule list.

    Parameters
    ----------
    list_fn : str
        Path of the granule list CSV; it is referenced by the ``queue ... from``
        line and is not read here.
    sub_fn : str, optional
        Path of the submit file to write. When empty or None it is derived from
        ``list_fn`` by replacing ``granule_lists/`` with ``HTCondor_submit/``
        and ``.csv`` with ``.submit``.
    mem_gb : int
        Value written to ``request_memory`` (in GB).
    stream_error, stream_output : bool
        When true, add ``stream_error = True`` / ``stream_output = True``.
    disk_gb : int, optional
        Value written to ``request_disk`` (in GB). Defaults to ``mem_gb``,
        which is what was always written before this parameter existed.

    Returns
    -------
    str
        The path of the submit file that was written.
    """
    if not sub_fn:
        sub_fn = list_fn.replace('granule_lists/', 'HTCondor_submit/').replace('.csv', '.submit')
    if disk_gb is None:
        disk_gb = mem_gb

    lines = [
        'universe    = vanilla',
        '+SingularityImage = "osdf:///ospool/ap21/data/fliphilipp/containers/icelake-container_v1.sif"',
        'Requirements = HAS_SINGULARITY == True && OSG_HOST_KERNEL_VERSION >= 31000',
        'executable  = run_py.sh',
        'arguments = $(granule) $(polygon)',
        'max_retries = 30',
        'success_exit_code = 69',
        'transfer_input_files = detect_lakes.py, icelakes/__init__.py, icelakes/utilities.py, icelakes/nsidc.py, icelakes/detection.py, misc/test1, misc/test2, $(polygon), $(polygon_full)',
        'transfer_output_files = detection_out_data, detection_out_plot, detection_out_stat',
        'should_transfer_files = YES',
        'when_to_transfer_output = ON_EXIT',
        'periodic_release = (HoldReasonCode == 13)',
        'log           = logs/job_$(descriptor)-$(ClusterID)_$(granule)-$(ProcID).log',
        'error         = errs/job_$(descriptor)-$(ClusterID)_$(granule)-$(ProcID).err',
        'output        = outs/job_$(descriptor)-$(ClusterID)_$(granule)-$(ProcID).out',
    ]
    if stream_error:
        lines.append('stream_error = True')
    if stream_output:
        lines.append('stream_output = True')
    lines.extend([
        'request_cpus    = 1',
        'request_memory  = %iGB' % mem_gb,
        'request_disk    = %iGB' % disk_gb,
        'queue granule,polygon,descriptor,polygon_full from %s' % list_fn,
    ])

    # The context manager closes the file even if writing fails.
    with open(sub_fn, "w") as f:
        f.write('\n'.join(lines) + '\n')

    print('Wrote file %s.\n' % sub_fn)
    return sub_fn

In [30]:
! ls hold_lists

final_GLD3-test-15197642.csv           final_GLD3-test-15197642_processed.csv


In [31]:
# Parse the hold list of the GLD3 test run, then write a submit file with
# 32 GB requested for the combined (memory + others) resubmit list.
input_list = 'hold_lists/final_GLD3-test-15197642.csv'
fn_mem, fn_nomem, fn_resub = write_granule_list(input_list)
submit_fn = write_submit_file(list_fn=fn_resub, mem_gb=32)
! cat $submit_fn

Wrote file granule_lists/memory_GLD3-test-15197642.csv. (15 jobs)
Wrote file granule_lists/others_GLD3-test-15197642.csv. (6 jobs)
Wrote file granule_lists/resubmit_GLD3-test-15197642.csv. (21 jobs)
Wrote file HTCondor_submit/resubmit_GLD3-test-15197642.submit.

universe    = vanilla
+SingularityImage = "osdf:///ospool/ap21/data/fliphilipp/containers/icelake-container_v1.sif"
Requirements = HAS_SINGULARITY == True && OSG_HOST_KERNEL_VERSION >= 31000
executable  = run_py.sh
arguments = $(granule) $(polygon)
max_retries = 30
success_exit_code = 69
transfer_input_files = detect_lakes.py, icelakes/__init__.py, icelakes/utilities.py, icelakes/nsidc.py, icelakes/detection.py, misc/test1, misc/test2, $(polygon), $(polygon_full)
transfer_output_files = detection_out_data, detection_out_plot, detection_out_stat
should_transfer_files = YES
when_to_transfer_output = ON_EXIT
periodic_release = (HoldReasonCode == 13)
log           = logs/job_$(descriptor)-$(ClusterID)_$(granule)-$(ProcID).log
error

## Initial submit files

In [3]:
# One submit file per initial granule list, each requesting 16 GB.
for initial_list in (
    'granule_lists/GRE_2000_May2019_Jun2023.csv',
    'granule_lists/ANT_1000_Dec2018_Mar2021.csv',
    'granule_lists/ANT_1000_Dec2021_Mar2023.csv',
):
    write_submit_file(initial_list, mem_gb=16)

Wrote file HTCondor_submit/GRE_2000_May2019_Jun2023.submit.

Wrote file HTCondor_submit/ANT_1000_Dec2018_Mar2021.submit.

Wrote file HTCondor_submit/ANT_1000_Dec2021_Mar2023.submit.



In [4]:
# Submit file for the extra shoulder-season granule list, 16 GB requested.
write_submit_file(list_fn='granule_lists/extra_shoulderseason_GRE_2000_ANT_1000.csv', mem_gb=16)

Wrote file HTCondor_submit/extra_shoulderseason_GRE_2000_ANT_1000.submit.



In [19]:
# Submit file for the May-Sep 2023 Greenland "newdata" granule list, 16 GB requested.
write_submit_file(list_fn='granule_lists/GRE_2000_May2023_Sep2023_newdata.csv', mem_gb=16)

Wrote file HTCondor_submit/GRE_2000_May2023_Sep2023_newdata.submit.



In [8]:
# Test submit file: 16 GB requested, with stream_error / stream_output switched on.
write_submit_file(
    list_fn='granule_lists/newdata_gre-2023_test-2.csv',
    mem_gb=16,
    stream_error=True,
    stream_output=True,
)

Wrote file HTCondor_submit/newdata_gre-2023_test-2.submit.



In [6]:
# write_granule_list returns three paths (memory, others, combined resubmit
# list); unpacking into only two names raises ValueError.
input_list = 'hold_lists/final_gris-2023-run1-15175718.csv'
fn_mem, fn_nomem, fn_resub = write_granule_list(input_list)

Wrote file granule_lists/memory_gris-2023-run1-15175718.csv. (0 jobs)
Wrote file granule_lists/resubmit_gris-2023-run1-15175718.csv. (36 jobs)


In [16]:
# Submit file for the gris-2023-run1 resubmit list, 32 GB requested.
write_submit_file(list_fn='granule_lists/resubmit_gris-2023-run1-15175718.csv', mem_gb=32)

Wrote file HTCondor_submit/resubmit_gris-2023-run1-15175718.submit.



In [17]:
! cat HTCondor_submit/resubmit_gris-2023-run1-15175718.submit

universe    = vanilla
+SingularityImage = "osdf:///ospool/ap21/data/fliphilipp/containers/icelake-container_v1.sif"
Requirements = HAS_SINGULARITY == True && OSG_HOST_KERNEL_VERSION >= 31000
executable  = run_py.sh
arguments = $(granule) $(polygon)
max_retries = 30
success_exit_code = 69
transfer_input_files = detect_lakes.py, icelakes/__init__.py, icelakes/utilities.py, icelakes/nsidc.py, icelakes/detection.py, misc/test1, misc/test2, $(polygon), $(polygon_full)
transfer_output_files = detection_out_data, detection_out_plot, detection_out_stat
should_transfer_files = YES
when_to_transfer_output = ON_EXIT
periodic_release = (HoldReasonCode == 13)
log           = logs/job_$(descriptor)-$(ClusterID)_$(granule)-$(ProcID).log
error         = errs/job_$(descriptor)-$(ClusterID)_$(granule)-$(ProcID).err
output        = outs/job_$(descriptor)-$(ClusterID)_$(granule)-$(ProcID).out
request_cpus    = 1
request_memory  = 32GB
request_disk    = 32GB
queue granule,polygon,descriptor,polygon_full fr

In [18]:
! ls  HTCondor_submit

ANT_1000_Dec2018_Mar2021.submit
ANT_1000_Dec2021_Mar2023.submit
GRE_2000_May2019_Jun2023.submit
GRE_2000_May2023_Sep2023_newdata.submit
extra_shoulderseason_GRE_2000_ANT_1000.submit
gris-2023-run1.submit
memory_resubmit_all_2.submit
newdata_gre-2023_test-15.submit
newdata_gre-2023_test-2.submit
resubmit_all_2.submit
resubmit_gris-2023-run1-15175718.submit
z_resubmit_remaining91.submit
zz_resubmit_remaining45.submit


## TODO before next run: 
- make sure code runs properly locally
- then change sleep time to 1000 or something 

In [3]:
! ls hold_lists

final_ant-2123-1-138198.csv
final_ant-2123-1-138198_processed.csv
final_ant1821-1-138197.csv
final_ant1821-1-138197_processed.csv
final_extra-months-1-169920.csv
final_extra-months-1-169920_processed.csv
final_gre1823-1-138199.csv
final_gre1823-1-138199_processed.csv
final_gris-2023-run1-15175718.csv
final_z_all16gb_rerun-197770.csv
final_z_all16gb_rerun-197770_processed.csv
final_z_mem32gb_rerun-197768.csv
final_z_mem32gb_rerun-197768_processed.csv
final_zz_rerun-last45-200378.csv
final_zz_rerun-last45-200378_processed.csv
final_zzz_last_attempt-2274115.csv
final_zzz_last_attempt-2274115_processed.csv


In [6]:
# write_granule_list returns three paths (memory, others, combined resubmit
# list); unpacking into only two names raises ValueError.
input_list = 'hold_lists/final_ant1821-1-138197.csv'
fn_mem, fn_nomem, fn_resub = write_granule_list(input_list)

Wrote file granule_lists/memory_ant1821-1-138197.csv. (91 jobs)
Wrote file granule_lists/resubmit_ant1821-1-138197.csv. (1763 jobs)


In [8]:
# write_granule_list returns three paths (memory, others, combined resubmit
# list); unpacking into only two names raises ValueError.
input_list = 'hold_lists/final_ant-2123-1-138198.csv'
fn_mem, fn_nomem, fn_resub = write_granule_list(input_list)

Wrote file granule_lists/memory_ant-2123-1-138198.csv. (53 jobs)
Wrote file granule_lists/resubmit_ant-2123-1-138198.csv. (56 jobs)


In [9]:
# write_granule_list returns three paths (memory, others, combined resubmit
# list); unpacking into only two names raises ValueError.
input_list = 'hold_lists/final_gre1823-1-138199.csv'
fn_mem, fn_nomem, fn_resub = write_granule_list(input_list)

Wrote file granule_lists/memory_gre1823-1-138199.csv. (133 jobs)
Wrote file granule_lists/resubmit_gre1823-1-138199.csv. (867 jobs)


In [10]:
# write_granule_list returns three paths (memory, others, combined resubmit
# list); unpacking into only two names raises ValueError.
input_list = 'hold_lists/final_extra-months-1-169920.csv'
fn_mem, fn_nomem, fn_resub = write_granule_list(input_list)

Wrote file granule_lists/memory_extra-months-1-169920.csv. (40 jobs)
Wrote file granule_lists/resubmit_extra-months-1-169920.csv. (1294 jobs)


In [22]:
# Merge the per-run resubmit lists into a single list and write a submit file
# with 16 GB requested for it.
# NOTE(review): write_granule_list as currently defined also puts the memory
# jobs into resubmit_* (the non-memory jobs alone are in others_*) - confirm
# these are the intended inputs before re-running.
file_list_resub1 = [
    'granule_lists/resubmit_ant1821-1-138197.csv',
    'granule_lists/resubmit_ant-2123-1-138198.csv',
    'granule_lists/resubmit_gre1823-1-138199.csv',
    'granule_lists/resubmit_extra-months-1-169920.csv'
]
out_name = 'granule_lists/resubmit_all_2.csv'
dfs = [pd.read_csv(fn, header=None) for fn in file_list_resub1]
dfb = pd.concat(dfs).reset_index(drop=True)
dfb.to_csv(out_name, header=False, index=False)

# Column 2 of the list files is the job description; it names the ice sheet.
over_greenland = dfb[2].str.contains('GrIS', regex=False)
over_antarctica = dfb[2].str.contains('AIS', regex=False)

print(out_name)
print('Number of ganules over Greenland:', over_greenland.sum())
print('Number of ganules over Antarctica:', over_antarctica.sum())
print('Total number of granules:', len(dfb))
write_submit_file(out_name, mem_gb=16)

granule_lists/resubmit_all_2.csv
Number of ganules over Greenland: 1980
Number of ganules over Antarctica: 2000
Total number of granules: 3980
Wrote file HTCondor_submit/resubmit_all_2.submit.



In [23]:
# Merge the per-run memory lists into a single list and write a submit file
# with 32 GB requested for it.
file_list_resub1 = [
    'granule_lists/memory_ant1821-1-138197.csv',
    'granule_lists/memory_ant-2123-1-138198.csv',
    'granule_lists/memory_gre1823-1-138199.csv',
    'granule_lists/memory_extra-months-1-169920.csv'
]
out_name = 'granule_lists/memory_resubmit_all_2.csv'
dfs = [pd.read_csv(fn, header=None) for fn in file_list_resub1]
dfb = pd.concat(dfs).reset_index(drop=True)
dfb.to_csv(out_name, header=False, index=False)

# Column 2 of the list files is the job description; it names the ice sheet.
over_greenland = dfb[2].str.contains('GrIS', regex=False)
over_antarctica = dfb[2].str.contains('AIS', regex=False)

print(out_name)
print('Number of ganules over Greenland:', over_greenland.sum())
print('Number of ganules over Antarctica:', over_antarctica.sum())
print('Total number of granules:', len(dfb))
write_submit_file(out_name, mem_gb=32)

granule_lists/memory_resubmit_all_2.csv
Number of ganules over Greenland: 135
Number of ganules over Antarctica: 182
Total number of granules: 317
Wrote file HTCondor_submit/memory_resubmit_all_2.submit.



In [26]:
! ls hold_lists

final_ant-2123-1-138198.csv
final_ant-2123-1-138198_processed.csv
final_ant1821-1-138197.csv
final_ant1821-1-138197_processed.csv
final_extra-months-1-169920.csv
final_extra-months-1-169920_processed.csv
final_gre1823-1-138199.csv
final_gre1823-1-138199_processed.csv
final_z_all16gb_rerun-197770.csv
final_z_mem32gb_rerun-197768.csv


In [27]:
# write_granule_list returns three paths (memory, others, combined resubmit
# list); unpacking into only two names raises ValueError.
input_list = 'hold_lists/final_z_all16gb_rerun-197770.csv'
fn_mem, fn_nomem, fn_resub = write_granule_list(input_list)
input_list = 'hold_lists/final_z_mem32gb_rerun-197768.csv'
fn_mem, fn_nomem, fn_resub = write_granule_list(input_list)

Wrote file granule_lists/memory_z_all16gb_rerun-197770.csv. (35 jobs)
Wrote file granule_lists/resubmit_z_all16gb_rerun-197770.csv. (50 jobs)
Wrote file granule_lists/memory_z_mem32gb_rerun-197768.csv. (4 jobs)
Wrote file granule_lists/resubmit_z_mem32gb_rerun-197768.csv. (2 jobs)


In [28]:
# Merge the memory and resubmit lists of both "z" reruns into one list and
# write a submit file with 32 GB requested for it.
# NOTE(review): write_granule_list as currently defined also puts the memory
# jobs into resubmit_*, so combining memory_* and resubmit_* of the same run
# would list those jobs twice - confirm the inputs before re-running.
file_list_resub2 = [
    'granule_lists/memory_z_all16gb_rerun-197770.csv',
    'granule_lists/resubmit_z_all16gb_rerun-197770.csv',
    'granule_lists/memory_z_mem32gb_rerun-197768.csv',
    'granule_lists/resubmit_z_mem32gb_rerun-197768.csv'
]
out_name = 'granule_lists/z_resubmit_remaining91.csv'
dfs = [pd.read_csv(fn, header=None) for fn in file_list_resub2]
dfb = pd.concat(dfs).reset_index(drop=True)
dfb.to_csv(out_name, header=False, index=False)

# Column 2 of the list files is the job description; it names the ice sheet.
over_greenland = dfb[2].str.contains('GrIS', regex=False)
over_antarctica = dfb[2].str.contains('AIS', regex=False)

print(out_name)
print('Number of ganules over Greenland:', over_greenland.sum())
print('Number of ganules over Antarctica:', over_antarctica.sum())
print('Total number of granules:', len(dfb))
write_submit_file(out_name, mem_gb=32)

granule_lists/z_resubmit_remaining91.csv
Number of ganules over Greenland: 60
Number of ganules over Antarctica: 31
Total number of granules: 91
Wrote file HTCondor_submit/z_resubmit_remaining91.submit.



In [30]:
! ls hold_lists

final_ant-2123-1-138198.csv
final_ant-2123-1-138198_processed.csv
final_ant1821-1-138197.csv
final_ant1821-1-138197_processed.csv
final_extra-months-1-169920.csv
final_extra-months-1-169920_processed.csv
final_gre1823-1-138199.csv
final_gre1823-1-138199_processed.csv
final_z_all16gb_rerun-197770.csv
final_z_all16gb_rerun-197770_processed.csv
final_z_mem32gb_rerun-197768.csv
final_z_mem32gb_rerun-197768_processed.csv
final_zz_rerun-last45-200378.csv


In [10]:
# write_granule_list returns three paths (memory, others, combined resubmit
# list); unpacking into only two names raises ValueError.
input_list = 'hold_lists/final_zz_rerun-last45-200378.csv'
fn_mem, fn_nomem, fn_resub = write_granule_list(input_list)

Wrote file granule_lists/memory_zz_rerun-last45-200378.csv. (0 jobs)
Wrote file granule_lists/resubmit_zz_rerun-last45-200378.csv. (45 jobs)


In [11]:
# Copy the non-memory list of the previous cell (fn_nomem) to its own file and
# write a submit file with 64 GB requested for it.
out_name = 'granule_lists/zz_resubmit_remaining45.csv'
dfb = pd.read_csv(fn_nomem, header=None)
dfb.to_csv(out_name, header=False, index=False)

# Column 2 of the list files is the job description; it names the ice sheet.
over_greenland = dfb[2].str.contains('GrIS', regex=False)
over_antarctica = dfb[2].str.contains('AIS', regex=False)

print(out_name)
print('Number of ganules over Greenland:', over_greenland.sum())
print('Number of ganules over Antarctica:', over_antarctica.sum())
print('Total number of granules:', len(dfb))
write_submit_file(out_name, mem_gb=64)

granule_lists/zz_resubmit_remaining45.csv
Number of ganules over Greenland: 33
Number of ganules over Antarctica: 12
Total number of granules: 45
Wrote file HTCondor_submit/zz_resubmit_remaining45.submit.



In [12]:
! ls hold_lists

final_ant-2123-1-138198.csv
final_ant-2123-1-138198_processed.csv
final_ant1821-1-138197.csv
final_ant1821-1-138197_processed.csv
final_extra-months-1-169920.csv
final_extra-months-1-169920_processed.csv
final_gre1823-1-138199.csv
final_gre1823-1-138199_processed.csv
final_z_all16gb_rerun-197770.csv
final_z_all16gb_rerun-197770_processed.csv
final_z_mem32gb_rerun-197768.csv
final_z_mem32gb_rerun-197768_processed.csv
final_zz_rerun-last45-200378.csv
final_zz_rerun-last45-200378_processed.csv
final_zzz_last_attempt-2274115.csv
final_zzz_last_attempt-2274115_processed.csv


In [13]:
# write_granule_list returns three paths (memory, others, combined resubmit
# list); unpacking into only two names raises ValueError.
input_list = 'hold_lists/final_zzz_last_attempt-2274115.csv'
fn_mem, fn_nomem, fn_resub = write_granule_list(input_list)

Wrote file granule_lists/memory_zzz_last_attempt-2274115.csv. (0 jobs)
Wrote file granule_lists/resubmit_zzz_last_attempt-2274115.csv. (44 jobs)


In [14]:
# Copy the non-memory list of the previous cell (fn_nomem) to its own file.
# No submit file is written for these jobs.
out_name = 'granule_lists/zzz_NOresubmit_remaining44.csv'
dfb = pd.read_csv(fn_nomem, header=None)
dfb.to_csv(out_name, header=False, index=False)

# Column 2 of the list files is the job description; it names the ice sheet.
over_greenland = dfb[2].str.contains('GrIS', regex=False)
over_antarctica = dfb[2].str.contains('AIS', regex=False)

print(out_name)
print('Number of ganules over Greenland:', over_greenland.sum())
print('Number of ganules over Antarctica:', over_antarctica.sum())
print('Total number of granules:', len(dfb))
print('NO SUBMIT FILE WRITTEN - re-run these locally, or just ignore!')

granule_lists/zzz_NOresubmit_remaining44.csv
Number of ganules over Greenland: 32
Number of ganules over Antarctica: 12
Total number of granules: 44
NO SUBMIT FILE WRITTEN - re-run these locally, or just ignore!


In [15]:
! cat granule_lists/zzz_NOresubmit_remaining44.csv

ATL03_20200206111502_06370611_006_01.h5,geojsons/simplified_ANT_1000_West_Ep-F.geojson,AIS_2019-20_1000_West_Ep-F,geojsons/ANT_1000_West_Ep-F.geojson
ATL03_20200301041337_09990612_006_01.h5,geojsons/simplified_ANT_1000_West_Ep-F.geojson,AIS_2019-20_1000_West_Ep-F,geojsons/ANT_1000_West_Ep-F.geojson
ATL03_20201203221707_10800910_006_01.h5,geojsons/simplified_ANT_1000_West_Ep-F.geojson,AIS_2020-21_1000_West_Ep-F,geojsons/ANT_1000_West_Ep-F.geojson
ATL03_20190122150622_03840212_006_02.h5,geojsons/simplified_ANT_1000_Peninsula_Ipp-J.geojson,AIS_2018-19_1000_Peninsula_Ipp-J,geojsons/ANT_1000_Peninsula_Ipp-J.geojson
ATL03_20190223103751_08700210_006_02.h5,geojsons/simplified_ANT_1000_East_Dp-E.geojson,AIS_2018-19_1000_East_Dp-E,geojsons/ANT_1000_East_Dp-E.geojson
ATL03_20200119062813_03590612_006_01.h5,geojsons/simplified_ANT_1000_East_Dp-E.geojson,AIS_2019-20_1000_East_Dp-E,geojsons/ANT_1000_East_Dp-E.geojson
ATL03_20201211155644_11980912_006_01.h5,geojsons/simplified_ANT_1000_East_Dp-E.geo

In [None]:
# write_granule_list returns three paths (memory, others, combined resubmit
# list); unpacking into only two names raises ValueError.
input_list = 'hold_lists/final_greenland-1-114211.csv'
fn_mem, fn_nomem, fn_resub = write_granule_list(input_list)
# Resubmit only the jobs that were not held for exceeding memory.
write_submit_file(fn_nomem)

In [None]:
# write_granule_list returns three paths (memory, others, combined resubmit
# list); unpacking into only two names raises ValueError.
input_list = 'hold_lists/final_antarctica-18-21-1-114218.csv'
fn_mem, fn_nomem, fn_resub = write_granule_list(input_list)
# Resubmit only the jobs that were not held for exceeding memory.
write_submit_file(fn_nomem)

In [None]:
# write_granule_list returns three paths (memory, others, combined resubmit
# list); unpacking into only two names raises ValueError.
input_list = 'hold_lists/final_antarctica-21-23-1-114219.csv'
fn_mem, fn_nomem, fn_resub = write_granule_list(input_list)
# Resubmit only the jobs that were not held for exceeding memory.
write_submit_file(fn_nomem)

In [None]:
# write_granule_list returns three paths (memory, others, combined resubmit
# list); unpacking into only two names raises ValueError.
input_list = 'hold_lists/final_greenland-2-117771.csv'
fn_mem, fn_nomem, fn_resub = write_granule_list(input_list)

In [None]:
# write_granule_list returns three paths (memory, others, combined resubmit
# list); unpacking into only two names raises ValueError.
input_list = 'hold_lists/final_antarctica-18-21-2-123983.csv'
fn_mem, fn_nomem, fn_resub = write_granule_list(input_list)

In [None]:
# write_granule_list returns three paths (memory, others, combined resubmit
# list); unpacking into only two names raises ValueError.
input_list = 'hold_lists/final_antarctica-21-23-2-123993.csv'
fn_mem, fn_nomem, fn_resub = write_granule_list(input_list)

In [None]:
# Merge the resubmit lists of the three second-round runs into one list,
# reporting the size of each, then write a submit file with 16 GB requested.
# NOTE(review): write_granule_list as currently defined also puts the memory
# jobs into resubmit_* (the non-memory jobs alone are in others_*) - confirm
# these are the intended inputs before re-running.
outname = 'granule_lists/resubmit_all-3.csv'
filelist_2nds = [
    'granule_lists/resubmit_greenland-2-117771.csv',
    'granule_lists/resubmit_antarctica-18-21-2-123983.csv',
    'granule_lists/resubmit_antarctica-21-23-2-123993.csv'
]
dfs = []
for list_path in filelist_2nds:
    part = pd.read_csv(list_path, header=None)
    dfs.append(part)
    print(len(part), 'jobs', list_path)
df_all = pd.concat(dfs)
df_all.to_csv(outname, header=False, index=False)
print('Wrote file %s. (%i jobs)' % (outname, len(df_all)))
write_submit_file(outname, mem_gb=16)

In [None]:
# write_granule_list returns three paths (memory, others, combined resubmit
# list); unpacking into only two names raises ValueError.
input_list = 'hold_lists/final_resubmit_all-4-135549.csv'
fn_mem, fn_nomem, fn_resub = write_granule_list(input_list)
# Resubmit only the jobs that were not held for exceeding memory.
write_submit_file(fn_nomem)

In [None]:
# Merge the memory lists of all earlier runs into one list, reporting the size
# of each, then write a submit file for it.
# NOTE(review): these jobs were held for exceeding request_memory, yet the
# submit file is written with the default mem_gb (16) - confirm this is intended.
outname = 'granule_lists/memory_resubmit_all-4.csv'
filelist_mem = [
    'granule_lists/memory_antarctica-18-21-1-114218.csv',
    'granule_lists/memory_antarctica-18-21-2-123983.csv',
    # 'granule_lists/memory_antarctica-21-23-1-114219.csv', # no jobs
    'granule_lists/memory_antarctica-21-23-2-123993.csv',
    'granule_lists/memory_greenland-1-114211.csv',
    'granule_lists/memory_greenland-2-117771.csv',
    'granule_lists/memory_resubmit_all-4-135549.csv'
]
dfs = []
for list_path in filelist_mem:
    part = pd.read_csv(list_path, header=None)
    dfs.append(part)
    print(len(part), 'jobs', list_path)
df_all = pd.concat(dfs)
df_all.to_csv(outname, header=False, index=False)
print('Wrote file %s. (%i jobs)' % (outname, len(df_all)))
write_submit_file(outname)