# Filtering function
The segmentation routine identifies contiguous regions likely containing ice. We know from experience that it often flags regions that are not ice; these false positives can be filtered out using clues such as the floe circularity.

The purpose of this notebook is to develop a filtering function with tunable parameters. So far, the main sections here are
1. Compile a list of all the test cases with information on (a) whether each stage of IFT completed, (b) whether it's been assigned for floe labeling, and (c) whether the floe labels have been completed.
2. Read in the hdf5 files and true color images
3. Overlay the floe labels on the true color images. These are the "unfiltered" results
4. (TBD) Provide an example of a filtered image, where the floes not passing the filter settings are masked.

In [111]:
import h5py
import pandas as pd
from os.path import join
from os import listdir
import numpy as np
import rasterio as rio
from rasterio.plot import reshape_as_image
import proplot as pplt

caselist_loc = '../data/validation_tables/qualitative_assessment_tables/all_100km_cases.csv'
eval_table_loc = '../data/ift_data/ift_pipeline_default/eval_tables/'
assign_table_loc = '../data/validation_tables/floe_labeling_tables/'
label_floe_loc = '../data/validation_images/labeled_floes_png/'
image_loc = '../data/validation_images/'

df_cases = pd.read_csv(caselist_loc, index_col=0)

# Lookup table that links the filenames and case numbers. Only the aqua rows
# are used. x[4:] drops the first four characters of long_name
# (presumably a zero-padded case-number prefix -- TODO confirm).
is_aqua = df_cases.satellite == 'aqua'
case_reference = pd.DataFrame({
    'case_number': df_cases.loc[is_aqua, 'case_number'].values,
    'location': [x[4:] for x in df_cases.loc[is_aqua, 'long_name'].values]})

# Results on whether IFT ran to completion on each case. Match on the file
# extension (not a substring) so that names such as "table.csv.bak" are skipped.
df_eval = pd.concat(
    [pd.read_csv(join(eval_table_loc, file))
     for file in listdir(eval_table_loc) if file.endswith('.csv')],
    ignore_index=True)
df_eval['startdate'] = pd.to_datetime(df_eval['startdate'].values)
df_eval['region'] = [x.split('-100km')[0] for x in df_eval['location']]

# Floe-labeling assignments, so we know which folder holds un-reviewed floe labels.
df_assign = pd.concat(
    [pd.read_csv(join(assign_table_loc, file), index_col=0)
     for file in listdir(assign_table_loc) if file.endswith('.csv')],
    ignore_index=True)

# Add the case number to the evaluation table (inner join on location) and
# keep only the columns we need.
df_merged = df_eval.merge(case_reference, on='location').loc[
    :, ['region', 'case_number', 'location', 'startdate', 'enddate', 'preprocess']]

# Filenames for the relevant images. The terra / falsecolor names are derived
# from the aqua truecolor name by swapping only the satellite and product
# tokens, so a region name can never be rewritten by accident.
df_merged['truecolor_aqua'] = [
    '_'.join([str(cn).zfill(3), region, '100km', start_date.strftime('%Y%m%d')])
    + '.aqua.truecolor.250m.tiff'
    for cn, region, start_date in zip(df_merged['case_number'],
                                      df_merged['region'],
                                      df_merged['startdate'])]
df_merged['falsecolor_aqua'] = [x.replace('truecolor', 'falsecolor') for x in df_merged['truecolor_aqua']]
df_merged['truecolor_terra'] = [x.replace('.aqua.', '.terra.') for x in df_merged['truecolor_aqua']]
df_merged['falsecolor_terra'] = [x.replace('.aqua.', '.terra.') for x in df_merged['falsecolor_aqua']]

# Add info on who was assigned the initial floe labeling. The outer join keeps
# assignment rows that have no evaluation entry; those rows carry NaN filenames.
df_merged = df_merged.merge(df_assign.loc[:, ['case_number', 'fl_analyst']],
                            on='case_number', how='outer')

# Check if the floe labeling PNG has been created.
labeled_floes = listdir(label_floe_loc)
labeled_floe_names = set(labeled_floes)  # set for constant-time membership tests


def _label_png_exists(tiff_name, satellite, available=labeled_floe_names):
    """Return True if the labeled-floe PNG matching `tiff_name` is in `available`.

    `tiff_name` is a truecolor filename; a non-string value (NaN from the outer
    merge) means there is no image for the row, so the answer is False.
    """
    if not isinstance(tiff_name, str):
        return False
    stem = tiff_name.split('.')[0].replace('_100km', '')
    return stem + '_' + satellite + '_labeled_floes.png' in available


# Flag values: NaN when the case was never assigned, otherwise True/False for
# whether the PNG exists. Building each column in one pass avoids writing NaN
# element-wise into a bool column (a dtype-incompatible setitem).
was_assigned = df_merged['fl_analyst'].notnull()
for satellite in ['aqua', 'terra']:
    df_merged[satellite + '_labeled_floes'] = [
        _label_png_exists(tiff_name, satellite) if assigned else np.nan
        for tiff_name, assigned in zip(df_merged['truecolor_' + satellite], was_assigned)]

df_merged = df_merged.set_index('case_number').sort_index()

In [153]:
# Cases whose Aqua labeled-floe PNG exists. The flag column can hold
# True/False/NaN, so compare against True explicitly instead of using it as a mask.
summary_columns = ['region', 'fl_analyst', 'aqua_labeled_floes', 'terra_labeled_floes']
df_merged.loc[df_merged['aqua_labeled_floes'].eq(True), summary_columns]

Unnamed: 0_level_0,region,fl_analyst,aqua_labeled_floes,terra_labeled_floes
case_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
23,baffin_bay,ES,True,True
32,baffin_bay,ES,True,True
33,baffin_bay,ES,True,True
37,baffin_bay,ER,True,True
45,baffin_bay,ES,True,True
50,barents-kara_seas,ER,True,True
81,barents-kara_seas,ES,True,True
92,barents-kara_seas,ES,True,True
107,beaufort_sea,ER,True,True
110,beaufort_sea,ES,True,True


In [197]:
experiment = 'ift_pipeline_minarea_100px'

# Per-case containers, each keyed by case_number and then by satellite name.
tc_images = {}       # true color rasters as returned by rasterio's read()
fc_images = {}       # false color rasters as returned by rasterio's read()
floe_properties = {} # DataFrame of floe properties from the IFT HDF5 output
labeled_images = {}  # labeled segmentation image, with label 0 masked out

# Only cases where the preprocess stage passed are loaded.
for case_number in df_merged.loc[df_merged.preprocess=='pass'].index:
    region = df_merged.loc[case_number, 'region']
    case = df_merged.loc[case_number, 'location']
    dataloc = join('../data/ift_data/', experiment, 'ift_results', region, case)
    h5_dir = join(dataloc, 'preprocess', 'hdf5-files')
    files = listdir(h5_dir)

    tc_images[case_number] = {}
    fc_images[case_number] = {}
    floe_properties[case_number] = {}
    labeled_images[case_number] = {}

    for file in files:
        if 'h5' in file:
            # The satellite name is taken from the second dot-separated token of the filename.
            satellite = file.split('.')[1]
            # Open read-only inside a context manager so the handle is closed
            # after each file; the [:] reads copy the data into memory first.
            with h5py.File(join(h5_dir, file), 'r') as ift_data:
                floe_group = ift_data['floe_properties']
                floe_properties[case_number][satellite] = pd.DataFrame(
                    data=floe_group['properties'][:].T, # note the transposition
                    columns=floe_group['column_names'][:].astype(str))
                labels = floe_group['labeled_image'][:,:].T # note the transposition
            # Mask label 0 so only labeled pixels are drawn in overlays.
            labeled_images[case_number][satellite] = np.ma.masked_array(
                labels, mask=labels==0)
        else:
            print(case_number, 'no h5')
    if len(files) == 0:
        print(case_number, 'no files')

    # Read the true and false color images for both satellites.
    for satellite in ['aqua', 'terra']:
        with rio.open(join(image_loc, 'truecolor', df_merged.loc[case_number, 'truecolor_' + satellite])) as im:
            tc_images[case_number][satellite] = im.read()
        with rio.open(join(image_loc, 'falsecolor', df_merged.loc[case_number, 'falsecolor_' + satellite])) as im:
            fc_images[case_number][satellite] = im.read()

In [199]:
# Overlay the labeled image on the true color image, one two-panel figure per case.
for case_number in labeled_images:
    case_labels = labeled_images[case_number]
    # Both panels are drawn, so both satellites must have a labeled image.
    # Checking only 'terra' would raise a KeyError on cases missing 'aqua'.
    if 'aqua' not in case_labels or 'terra' not in case_labels:
        continue
    fig, axs = pplt.subplots(width=8, height=4, ncols=2)
    for ax, satellite in zip(axs, ['aqua', 'terra']):
        ax.imshow(reshape_as_image(tc_images[case_number][satellite]))
        # Labels are taken modulo 10 before being passed to the colormap.
        ax.imshow(case_labels[satellite] % 10, cmap='tokyo')
        ax.format(ltitle=satellite.title())
    region_title = df_merged.loc[case_number, 'region'].replace('_', ' ').title()
    axs.format(suptitle='Case ' + str(case_number) + ' (' + region_title + ')')
    fig.save('../figures/' + experiment + '/overlaid_segmentation/' + str(case_number).zfill(3) + '_segmentation_results.png', dpi=300)
    # Close each figure after saving so open figures do not accumulate across the loop.
    pplt.close(fig)

Some very obvious errors that should not happen ever:
- Case 32. This one should've been an easy one for the algorithm -- lots of nice, clear floes. Labeled image contains floes that are inside of the land mask.
- Case 20. Not sure what happened here.
- Case 22. Very large segment, likely a cloud, with many holes in it.
- Case 24. Possible case of the land mask being returned as a floe.
    
