# IFT-pipeline evaluation table
Now that there is a functional pipeline, we can use it to identify where the various stages of the pipeline fail, in order to guide development. This notebook checks the output of each stage's subfolder, given a site specification table and a results folder.

## Steps that we are checking
1. soit
2. landmask
3. preprocess
4. extractfeatures
5. tracking
6. exportH5

The first four are applied linearly, so the maximum number of successes for step `i` is the number of successes at step `i-1`. Here, we consider a step a success if the task completed without error, i.e., *something* is in the appropriate folder.

TBD: Consider replacing this notebook with a section in the README and a script.

In [11]:
import numpy as np
import os
import pandas as pd

In [15]:
# Regions to evaluate. Each region maps to the list of run folders found under
# ../data/ift_results/<region>; the lists are filled in by the loop below.
# NOTE(review): 'sea_of_okhostk' looks like a misspelling of "Okhotsk"; it is
# left unchanged because it must match the folder and CSV names on disk.
runs = {'baffin_bay': [],
        'beaufort_sea': [],
        'barents-kara_seas': [],
        'chukchi-east_siberian_sea': [],
        'greenland_sea': [],
        'hudson_bay': [],
        'laptev_sea': [],
        'sea_of_okhostk': []}
for region in runs:
    region_dir = '../data/ift_results/' + region
    # A region without a results folder is reported and left with an empty run
    # list instead of aborting the whole notebook with FileNotFoundError.
    if not os.path.isdir(region_dir):
        print('No results folder found for region: ' + region)
        continue
    # os.listdir returns entries in arbitrary order; sort so the runs are
    # always processed (and later printed) in the same order.
    for file in sorted(os.listdir(region_dir)):
        # Skip hidden entries (e.g. .DS_Store) and stray files: each run is
        # listed as a directory by the evaluation step, so only directories
        # are valid runs.
        if not file.startswith('.') and os.path.isdir(os.path.join(region_dir, file)):
            runs[region].append(file)

In [24]:
# Status columns written to each evaluation table, in the order they are checked.
stage_columns = ['soit', 'landmask', 'preprocess', 'extractH5', 'tracker']


def _stage_outputs(folder, ignore=('.DS_Store',)):
    """List the entries of `folder` whose names are not in `ignore`.

    Returns None when `folder` is not an existing directory, so callers can
    tell "folder missing" apart from "folder present but empty".
    """
    if not os.path.isdir(folder):
        return None
    return [x for x in os.listdir(folder) if x not in ignore]


def _evaluate_case(case_dir):
    """Classify every stage of one case folder as 'pass', 'fail' or 'NA'.

    Parameters
    ----------
    case_dir : str
        Path to the folder holding the outputs of a single case.

    Returns
    -------
    dict
        Maps each name in `stage_columns` to 'pass', 'fail' or 'NA'.
        'NA' means the stage was not attempted (an earlier stage did not pass).
    """
    status = dict.fromkeys(stage_columns, 'NA')

    # soit: no soit folder -> not attempted; empty folder -> failed.
    # .DS_Store is ignored here as it is for every other stage.
    soit_files = _stage_outputs(case_dir + '/soit')
    if soit_files is not None:
        status['soit'] = 'pass' if soit_files else 'fail'

    # landmask: only counts as a failure if soit produced its input.
    # A missing landmasks folder is treated the same as an empty one.
    if _stage_outputs(case_dir + '/landmasks'):
        status['landmask'] = 'pass'
    elif status['soit'] == 'pass':
        status['landmask'] = 'fail'

    # preprocess: slightly different check. Per the original note, the
    # hdf5-files subfolder is always there, so it is not counted as output.
    preprocess_files = _stage_outputs(case_dir + '/preprocess',
                                      ignore=('.DS_Store', 'hdf5-files'))
    if preprocess_files:
        status['preprocess'] = 'pass'
        # The H5 export and the tracker are only checked once preprocess passed.
        h5_files = _stage_outputs(case_dir + '/preprocess/hdf5-files')
        status['extractH5'] = 'pass' if h5_files else 'fail'
        tracker_files = _stage_outputs(case_dir + '/tracker')
        status['tracker'] = 'pass' if tracker_files else 'fail'
    elif status['soit'] == 'pass' and status['landmask'] == 'pass':
        status['preprocess'] = 'fail'

    return status


# results[region][timestamp] is a DataFrame indexed by case location with one
# status column per stage. The same table is also written to
# <results folder>/<region>_evaluation_table.csv.
results = {}
for region in runs:
    results[region] = {}
    if not runs[region]:
        # Nothing to evaluate, so the case definition file is not needed.
        continue

    # The case definitions depend only on the region: read them once and give
    # every run its own copy to annotate.
    case_definitions = pd.read_csv(
        '../data/ift_case_definitions/' + region + '_100km_cases.csv',
        index_col='location')

    for timestamp in runs[region]:
        site_locations = case_definitions.copy()
        results_folder = region + '/' + timestamp
        results_loc = '../data/ift_results/' + results_folder

        for column in stage_columns:
            site_locations[column] = 'NA'
        for case in site_locations.index:
            case_status = _evaluate_case(results_loc + '/' + case)
            for column, outcome in case_status.items():
                site_locations.loc[case, column] = outcome

        evaluation = site_locations.loc[:, stage_columns]
        evaluation.to_csv(results_loc + '/' + region + '_evaluation_table.csv')
        results[region][timestamp] = evaluation

In [27]:
# For every region and run, count how many cases passed and failed each stage.
# attempted[region][timestamp] is a DataFrame indexed by stage name with the
# columns 'pass', 'fail' and 'attempted' (= pass + fail; 'NA' is not counted).
summary_columns = ['soit', 'landmask', 'preprocess', 'extractH5', 'tracker']
attempted = {}
for region, region_results in results.items():
    attempted[region] = {}
    for timestamp, evaluation_table in region_results.items():
        statuses = evaluation_table.loc[:, summary_columns]
        stage_summary = pd.DataFrame({
            'pass': (statuses == 'pass').sum(axis=0),
            'fail': (statuses == 'fail').sum(axis=0),
        })
        stage_summary['attempted'] = stage_summary['pass'] + stage_summary['fail']
        attempted[region][timestamp] = stage_summary
# attempted = pd.concat(attempted) 
# attempted = attempted.reset_index()
# attempted.rename({'level_0': 'region','level_1': 'task'}, axis=1, inplace=True)

In [20]:
# Fraction of attempted cases that failed, per stage, for every region and run.
# `attempted` is a nested dict (region -> timestamp -> summary DataFrame), so
# it is flattened into one row per run here and is left unmodified.
order = ['soit', 'landmask', 'preprocess', 'extractH5', 'tracker']
fail_fraction_rows = [
    {'region': region_name,
     'timestamp': run_name,
     # Stages with zero attempts give 0/0, which pandas reports as NaN.
     **(run_summary['fail'] / run_summary['attempted']).to_dict()}
    for region_name, region_runs in attempted.items()
    for run_name, run_summary in region_runs.items()
]
# Passing `columns` fixes the stage order and keeps the frame well-formed even
# when there are no runs at all.
fail_fraction = (
    pd.DataFrame(fail_fraction_rows, columns=['region', 'timestamp'] + order)
    .set_index(['region', 'timestamp'])
    .round(2)
)
print('Fraction of failed attempts')
# Last expression of the cell, so the notebook renders the table.
fail_fraction

Fraction of failed attempts


In [33]:
# Show the pass/fail/attempted counts of every run for a single region.
region = 'chukchi-east_siberian_sea'
for timestamp, run_summary in attempted[region].items():
    print(run_summary)

            pass  fail  attempted
soit          19     2         21
landmask      19     0         19
preprocess     2    17         19
extractH5      2     0          2
tracker        1     1          2
            pass  fail  attempted
soit          19     2         21
landmask      19     0         19
preprocess     2    17         19
extractH5      2     0          2
tracker        1     1          2


In [35]:
start_dates = [m.strftime('%Y-%m-%d') for m in pd.date_range('2019-10-01', '2021-05-06', freq='1MS')]
end_dates = [m.strftime('%Y-%m-%d') for m in pd.date_range('2019-10-01', '2021-05-06', freq='1ME')]
end_dates

['2019-10-31',
 '2019-11-30',
 '2019-12-31',
 '2020-01-31',
 '2020-02-29',
 '2020-03-31',
 '2020-04-30',
 '2020-05-31',
 '2020-06-30',
 '2020-07-31',
 '2020-08-31',
 '2020-09-30',
 '2020-10-31',
 '2020-11-30',
 '2020-12-31',
 '2021-01-31',
 '2021-02-28',
 '2021-03-31',
 '2021-04-30']