# IFT-pipeline evaluation table
Now that there is a functional pipeline, we can identify the stages at which it fails and use that information to guide development. This notebook checks the output of each subfolder given a site specification table and a results folder.

## Steps that we are checking
1. soit
2. landmask
3. preprocess
4. extractfeatures
5. tracking
6. exportH5

The first four are applied linearly, so the maximum number of successes for step `i` is the number of successes at step `i-1`. Here, we consider a task a success if it completed without error, i.e., *something* is in the appropriate folder.

TBD: Consider replacing this notebook with a section in the README and a script.

In [10]:
import numpy as np
import os
import pandas as pd

In [17]:
def _list_outputs(folder, ignore=('.DS_Store',)):
    """Return the entries of `folder`, skipping any name listed in `ignore`.

    A folder that does not exist is treated as empty, so a case that never
    reached a pipeline stage is reported as "no output" instead of raising
    FileNotFoundError.
    """
    if not os.path.isdir(folder):
        return []
    return [name for name in os.listdir(folder) if name not in ignore]


def _evaluate_case(case_dir):
    """Classify each pipeline task for one case folder.

    Returns a dict mapping task name to 'pass', 'fail' or 'NA'. A task is
    'pass' when its output folder holds something other than '.DS_Store',
    'fail' when the steps it depends on passed but it produced nothing, and
    'NA' when it was never attempted.
    """
    status = dict.fromkeys(['soit', 'landmask', 'preprocess', 'extractH5', 'tracker'], 'NA')

    # soit: only judged when the soit folder exists at all.
    soit_dir = os.path.join(case_dir, 'soit')
    if os.path.isdir(soit_dir):
        status['soit'] = 'pass' if _list_outputs(soit_dir) else 'fail'

    # landmask: counts as a failure only if soit succeeded first.
    if _list_outputs(os.path.join(case_dir, 'landmasks')):
        status['landmask'] = 'pass'
    elif status['soit'] == 'pass':
        status['landmask'] = 'fail'

    # preprocess: slightly different check, because the hdf5-files subfolder
    # is always present and therefore says nothing about success.
    preprocess_dir = os.path.join(case_dir, 'preprocess')
    if _list_outputs(preprocess_dir, ignore=('.DS_Store', 'hdf5-files')):
        status['preprocess'] = 'pass'
        # The H5 export and the tracker are only judged once preprocess passed.
        h5_files = _list_outputs(os.path.join(preprocess_dir, 'hdf5-files'))
        status['extractH5'] = 'pass' if h5_files else 'fail'
        tracker_files = _list_outputs(os.path.join(case_dir, 'tracker'))
        status['tracker'] = 'pass' if tracker_files else 'fail'
    elif status['soit'] == 'pass' and status['landmask'] == 'pass':
        status['preprocess'] = 'fail'

    return status


# Each region is paired with the timestamp of the pipeline run to evaluate.
region_runs = [
    ('baffin_bay', '20240124T1406Z'),
    ('beaufort_sea', '20240124T1618Z'),
    ('barents-kara_seas', '20240221T1715Z'),
    ('chukchi-east_siberian_sea', '20240223T1406Z'),
    ('greenland_sea', '20240223T1604Z'),
]
task_columns = ['soit', 'landmask', 'preprocess', 'extractH5', 'tracker']

results = {}
for region, timestamp in region_runs:
    site_locations = pd.read_csv('../data/ift_case_definitions/' + region + '_100km_cases.csv', index_col='location')
    results_folder = region + '_' + timestamp
    results_loc = '../data/ift_results/' + results_folder

    # A missing results folder is a configuration error, not a pipeline
    # failure, so stop here rather than reporting every case as 'NA'.
    if not os.path.isdir(results_loc):
        raise FileNotFoundError('Results folder not found: ' + results_loc)

    for task in task_columns:
        site_locations[task] = 'NA'
    for case in site_locations.index:
        case_status = _evaluate_case(os.path.join(results_loc, case))
        for task, outcome in case_status.items():
            site_locations.loc[case, task] = outcome

    evaluation = site_locations.loc[:, task_columns]
    # Write the table inside the results folder. The path was previously built
    # without a separator, which put the file next to the folder under a
    # run-together name.
    evaluation.to_csv(os.path.join(results_loc, region + '_evaluation_table.csv'))
    results[region] = evaluation

In [33]:
# Count passes and failures per task for every region. Cases marked 'NA'
# were never attempted and are left out of both counts.
task_columns = ['soit', 'landmask', 'preprocess', 'extractH5', 'tracker']
counts_by_region = {}
for region, table in results.items():
    statuses = table.loc[:, task_columns]
    counts = pd.DataFrame({
        'pass': (statuses == 'pass').sum(axis=0),
        'fail': (statuses == 'fail').sum(axis=0),
    })
    counts['attempted'] = counts['pass'] + counts['fail']
    counts_by_region[region] = counts

# Stack the per-region tables into one long table; the dict keys and the task
# names form an unnamed two-level index, which reset_index exposes as
# 'level_0' and 'level_1'.
attempted = (
    pd.concat(counts_by_region)
    .reset_index()
    .rename(columns={'level_0': 'region', 'level_1': 'task'})
)

In [42]:
# Show the task columns in pipeline order rather than alphabetically.
order = ['soit', 'landmask', 'preprocess', 'extractH5', 'tracker']

# Fraction of attempted cases that failed. A task with zero attempts gives
# NaN (0 / 0) rather than a misleading 0.
attempted['fail_fraction'] = attempted['fail'] / attempted['attempted']

# Use a plain pivot instead of pivot_table: pivot_table drops any task whose
# values are all NaN, which would make the `.loc[:, order]` selection raise
# KeyError. Assumes each (region, task) pair appears exactly once in
# `attempted`; pivot raises ValueError if that does not hold.
attempted.pivot(index='region', columns='task', values='fail_fraction').loc[:, order].round(2)

task,soit,landmask,preprocess,extractH5,tracker
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
baffin_bay,0.05,0.0,0.6,0.0,0.38
barents-kara_seas,0.1,0.0,0.89,0.0,0.5
beaufort_sea,0.0,0.0,0.57,0.0,0.56
chukchi-east_siberian_sea,0.1,0.0,0.89,0.0,0.5
greenland_sea,0.0,0.0,0.86,0.0,0.33
