# Get path to rawcolor images
## 09/11/22

In [1]:
import os
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import notebook
from shutil import copy
from datetime import datetime

Point to the location of the processed, unannotated data to be filtered. This process assumes that the processed data has the following path format:

`\abs\path\to\data\processed-[YYYY]-[MM]-[DD]-[hh]-[mm]-[ss].[ms]\webapp\low_mag_cam_rois.js`

Load the image ids from the annotated image library.

In [2]:
# Every sub-directory of the annotation library is treated as one concept (class
# label): its folder name is stored in the 'concept' column below.
# The directories are sorted so that the concatenation order -- and therefore which
# row comes "first" when duplicated image ids are dropped later on -- does not
# depend on the arbitrary order in which glob returns directory entries.
ptfs = sorted(
    (
        line
        for line in glob.glob(os.path.join(r'D:\spc_annotations_071122', '*'))
        if os.path.isdir(line)
    ),
    key=str.lower,
)

# iterate over the morphological features file
out = []
for ptf in ptfs:
    # the first file matching '*features.csv' is used; fail with a clear message
    # (instead of a bare IndexError) when a concept folder has none
    feature_files = sorted(glob.glob(os.path.join(ptf, '*features.csv')))
    if not feature_files:
        raise FileNotFoundError(f'no *features.csv file found in {ptf}')

    tmp = pd.read_csv(feature_files[0], index_col=[0])
    tmp['concept'] = os.path.basename(ptf)
    out.append(tmp)

df = pd.concat(out, axis=0)

In [3]:
# list every annotated file name that contains this ROI id
roi_name = 'low_mag_cam-1620179123922710-6244862924488-31147-000-208-1946-40-28'
[fname for fname in df.index.to_list() if roi_name in fname]

['low_mag_cam-1620179123922710-6244862924488-31147-000-208-1946-40-28_rawcolor.jpeg',
 'low_mag_cam-1620179123922710-6244862924488-31147-000-208-1946-40-28_rawcolor.jpeg']

Get the image ids into a consistent format so we can grab all the images from the original source directory. This is probably unnecessary but will ensure that all the annotated ROIs have an identical format and enable us to identify the original deployment the annotations came from.

In [4]:
# Bring every annotation file name into the form '<roi id>.jpeg':
#   1. drop the file extension and everything from '_rawcolor' onwards
#   2. if a ' (1)' marker is still present, drop everything from it onwards
new_ind = []
for fname in df.index.to_list():
    stem = os.path.splitext(fname)[0]
    roi = f"{stem.split('_rawcolor')[0]}.jpeg"
    if ' (1)' in roi:
        roi = f"{roi.split(' (1)')[0]}.jpeg"
    new_ind.append(roi)

df['img_id'] = new_ind

In [5]:
# index the measurements by the normalised image id
df = df.set_index('img_id')

Get a list of all `rawcolor` ROIs from the processed files. We can get this information from the index generated by [SPCconvert](https://github.com/guatek/Dual-Mag-Data).

In [6]:
# Glob patterns for the SPCconvert ROI index file of every processed run.
# Raw strings keep the Windows backslashes literal: '\p', '\w' and '\l' are not
# valid escape sequences, so non-raw literals trigger a (Deprecation/Syntax)Warning
# and are slated to become errors. The resulting pattern text is unchanged.
proc_patterns = [
    os.path.join(r'F:\SPC_on_SSD_1\Processed\For Eric', r'*\processed*\webapp\low_mag_cam_rois.js'),
    os.path.join(r'D:\SPC-proccessed-unannotated', r'processed*\webapp\low_mag_cam_rois.js'),
]

# collect the matches of all patterns, in pattern order
procd = []
for pattern in proc_patterns:
    procd.extend(glob.glob(pattern))

Loop over all the processed directories (those that have been run through SPCconvert) and read in ROI information from the Javascript database variable created for the webviewer. Use that list to compare against the annotation files and find the path to the `rawcolor` images.

In [7]:
out = []
for item in notebook.tqdm(procd):

    # read in the javascript file from the directory and strip surrounding
    # whitespace/newlines; the context manager closes the file on exit
    with open(item, 'r') as ff:
        js_lines = [line.strip() for line in ff]

    # ignore the first two lines
    js_lines = js_lines[2:]

    # a line holding only an open curly bracket marks a new entry; keep the
    # 10 lines that follow it. Each entry represents a single image
    js_entries = [js_lines[ix + 1:ix + 11] for ix, val in enumerate(js_lines) if val == '{']

    # Both values are the same for every ROI of this run, so compute them once.
    # The run (deployment) name is the 'processed-*' folder two levels above the
    # .js file; deriving it from the path structure works for any drive/depth,
    # whereas the drive-letter checks left `dep` unset (or stale from the previous
    # run) for paths that were on neither D: nor F:.
    webapp_dir = os.path.dirname(item)
    dep = os.path.basename(os.path.dirname(webapp_dir))

    raw_imgs = {}

    # use the entries to extract the absolute path to the file and the name of the data collection run
    for xx in js_entries:
        # NOTE(review): assumes the 8th line of an entry is '<key>: <relative image path>'
        # followed by two trailing characters -- confirm against the SPCconvert output
        url_parts = xx[7].split(': ')[1].split('/')
        img_id = url_parts[-1][:-2]

        abs_path = os.path.join(
            webapp_dir,
            'low_mag_cam_rois',
            url_parts[1],
            f'{os.path.splitext(img_id)[0]}_rawcolor.jpeg'
        )

        raw_imgs[img_id] = {
            'abspath': abs_path,
            'deployment': dep
        }

    # make it into a data frame (one row per image id) and save the annotated rows
    # in the output list; naming the columns keeps a run without any entries as a
    # well-formed, empty two-column frame
    raw_imgs = pd.DataFrame.from_dict(raw_imgs, orient='index', columns=['abspath', 'deployment'])
    out.append(raw_imgs[raw_imgs.index.isin(df.index)])

  0%|          | 0/64 [00:00<?, ?it/s]

Consolidate the list of dataframes of image paths to remove runs with no annotations and concatenate into a single DataFrame.

In [8]:
# Keep only the runs that matched at least one annotated image. Testing the row
# count also copes with frames that have no columns at all, for which
# `count().to_numpy()[0]` raises an IndexError.
out = [frame for frame in out if len(frame) > 0]
if not out:
    raise ValueError('none of the processed runs contains an annotated image id')

out = pd.concat(out)
out  # view it to check structure

Unnamed: 0,abspath,deployment
low_mag_cam-1619355845046705-164385749272-730-222-66-1448-156-144.jpeg,F:\SPC_on_SSD_1\Processed\For Eric\2021 Apr 25...,processed-2021-04-25-13-01-37.044783560
low_mag_cam-1619355845051241-164385749272-730-224-64-1448-164-120.jpeg,F:\SPC_on_SSD_1\Processed\For Eric\2021 Apr 25...,processed-2021-04-25-13-01-37.044783560
low_mag_cam-1619355845201834-164585731240-731-103-68-1582-48-48.jpeg,F:\SPC_on_SSD_1\Processed\For Eric\2021 Apr 25...,processed-2021-04-25-13-01-37.044783560
low_mag_cam-1619355845202345-164585731240-731-147-1210-1342-100-96.jpeg,F:\SPC_on_SSD_1\Processed\For Eric\2021 Apr 25...,processed-2021-04-25-13-01-37.044783560
low_mag_cam-1619355845206732-164585731240-731-289-2084-310-76-76.jpeg,F:\SPC_on_SSD_1\Processed\For Eric\2021 Apr 25...,processed-2021-04-25-13-01-37.044783560
...,...,...
low_mag_cam-1652110557426321-5678505638760-28313-093-532-1518-196-76.jpeg,D:\SPC-proccessed-unannotated\processed-2022-0...,processed-2022-05-09-14-01-33.040472616
low_mag_cam-1652110557623193-5678705621376-28314-114-460-1468-324-168.jpeg,D:\SPC-proccessed-unannotated\processed-2022-0...,processed-2022-05-09-14-01-33.040472616
low_mag_cam-1652110557823388-5678905604136-28315-150-450-1392-316-168.jpeg,D:\SPC-proccessed-unannotated\processed-2022-0...,processed-2022-05-09-14-01-33.040472616
low_mag_cam-1652110558024554-5679105572216-28316-163-532-1314-244-160.jpeg,D:\SPC-proccessed-unannotated\processed-2022-0...,processed-2022-05-09-14-01-33.040472616


Merge the new information with the data frame of all the measurements.

In [9]:
# left-join the image path / deployment columns onto the measurements by image id
df = df.join(out, how='left')
df  # print out to check structure

Unnamed: 0,clipped_fraction,area,minor_axis_length,major_axis_length,aspect_ratio,orientation,eccentricity,solidity,estimated_volume,timestamp,timestring,concept,abspath,deployment
low_mag_cam-1619355845046705-164385749272-730-222-66-1448-156-144.jpeg,0.0,5362,78.376377,91.684681,0.854847,-0.660084,0.518880,0.906815,2.359151e+06,2021-04-25 16:04:05.046705,2021-04-25T16:04:05.046705,Cnidaria,F:\SPC_on_SSD_1\Processed\For Eric\2021 Apr 25...,processed-2021-04-25-13-01-37.044783560
low_mag_cam-1619355845051241-164385749272-730-224-64-1448-164-120.jpeg,0.0,5399,78.551541,91.573970,0.857793,-0.665965,0.513995,0.911993,2.366846e+06,2021-04-25 16:04:05.051241,2021-04-25T16:04:05.051241,Cnidaria,F:\SPC_on_SSD_1\Processed\For Eric\2021 Apr 25...,processed-2021-04-25-13-01-37.044783560
low_mag_cam-1619355845201834-164585731240-731-103-68-1582-48-48.jpeg,0.0,564,25.977359,27.888513,0.931472,0.667505,0.363814,0.954315,7.883226e+04,2021-04-25 16:04:05.201834,2021-04-25T16:04:05.201834,Small_disk,F:\SPC_on_SSD_1\Processed\For Eric\2021 Apr 25...,processed-2021-04-25-13-01-37.044783560
low_mag_cam-1619355845201834-164585731240-731-103-68-1582-48-48.jpeg,0.0,564,25.977359,27.888513,0.931472,0.667505,0.363814,0.954315,7.883226e+04,2021-04-25 16:04:05.201834,2021-04-25T16:04:05.201834,Spherical-small,F:\SPC_on_SSD_1\Processed\For Eric\2021 Apr 25...,processed-2021-04-25-13-01-37.044783560
low_mag_cam-1619355845202345-164585731240-731-147-1210-1342-100-96.jpeg,0.0,2445,54.608207,57.419533,0.951039,1.067513,0.309071,0.941471,7.172393e+05,2021-04-25 16:04:05.202345,2021-04-25T16:04:05.202345,Small_disk,F:\SPC_on_SSD_1\Processed\For Eric\2021 Apr 25...,processed-2021-04-25-13-01-37.044783560
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
low_mag_cam-1652110557426321-5678505638760-28313-093-532-1518-196-76.jpeg,1.0,14896,87.749644,226.318360,0.387727,1.570796,0.921774,1.000000,7.299601e+06,2022-05-09 18:35:57.426321,2022-05-09T18:35:57.426321,Marine_snow_POC,D:\SPC-proccessed-unannotated\processed-2022-0...,processed-2022-05-09-14-01-33.040472616
low_mag_cam-1652110557623193-5678705621376-28314-114-460-1468-324-168.jpeg,1.0,54432,193.986254,374.121192,0.518512,1.570796,0.855070,1.000000,5.897159e+07,2022-05-09 18:35:57.623193,2022-05-09T18:35:57.623193,Marine_snow_POC,D:\SPC-proccessed-unannotated\processed-2022-0...,processed-2022-05-09-14-01-33.040472616
low_mag_cam-1652110557823388-5678905604136-28315-150-450-1392-316-168.jpeg,1.0,53088,193.986254,364.883543,0.531639,1.570796,0.846971,1.000000,5.751549e+07,2022-05-09 18:35:57.823388,2022-05-09T18:35:57.823388,Marine_snow_POC,D:\SPC-proccessed-unannotated\processed-2022-0...,processed-2022-05-09-14-01-33.040472616
low_mag_cam-1652110558024554-5679105572216-28316-163-532-1314-244-160.jpeg,1.0,39040,184.748478,281.744565,0.655730,1.570796,0.754995,1.000000,4.028152e+07,2022-05-09 18:35:58.024554,2022-05-09T18:35:58.024554,Marine_snow_POC,D:\SPC-proccessed-unannotated\processed-2022-0...,processed-2022-05-09-14-01-33.040472616


Check if any of the entries are missing the absolute path and deployment.

In [15]:
# rows for which no rawcolor path was found, counted per concept
no_path = df['abspath'].isna()
tmp = df.loc[no_path]
tmp.groupby('concept')['timestamp'].count()

concept
Diatoms_single_cells     1
Nauplii                  2
Tunicates               12
Name: timestamp, dtype: int64

Ok, so we're missing a total of 15 ROIs, all in classes for which we have a good number of annotations. Just drop them from the dataframe.

In [17]:
# Keep only the annotations whose rawcolor image path was found. Filtering on the
# column itself (rather than dropping `tmp.index` in place) makes this cell safe to
# re-run and independent of the scratch variable `tmp`, which is reassigned to a
# different frame further down the notebook.
df = df[df['abspath'].notna()]

Now check if there are duplicated images in the annotations.

In [37]:
# rows whose image id already occurred earlier in the frame
dup_rows = df.loc[df.index.duplicated()]
print(f'there are {dup_rows.shape[0]} duplicated image ids')

# number of repeated rows per concept, smallest first
dup_rows.groupby('concept')['timestamp'].count().sort_values()

there are 2428 duplicated image ids


concept
mollusca                               2
Nauplii                                6
ceratium                               8
Diatoms_spirals                        8
Marine_snow_POC                       10
Cnidaria                              11
Tunicates                             13
Diatoms_round_chains                  14
Plankton_all_other_taxa_combined      19
Small_disk                            21
Diatoms_straight_chains               21
Spiny_triangle                        22
Diatoms_single_cells                  24
copepoda                              25
chaetoceros                           25
Protozoans                            30
Spherical-small                     2169
Name: timestamp, dtype: int64

Most of the duplicates are in *Spherical-small*. Check one of those to see what the duplicate labels are.

In [34]:
# show every row recorded for this image id
example_id = 'low_mag_cam-1619360944060282-5263443969728-26228-012-66-1340-36-36.jpeg'
df.loc[df.index == example_id]

Unnamed: 0,clipped_fraction,area,minor_axis_length,major_axis_length,aspect_ratio,orientation,eccentricity,solidity,estimated_volume,timestamp,timestring,concept,abspath,deployment
low_mag_cam-1619360944060282-5263443969728-26228-012-66-1340-36-36.jpeg,0.0,366,20.273373,23.194526,0.874059,0.164243,0.485821,0.940874,39932.465653,2021-04-25 17:29:04.060282,2021-04-25T17:29:04.060282,Small_disk,F:\SPC_on_SSD_1\Processed\For Eric\2021 Apr 25...,processed-2021-04-25-13-01-37.044783560
low_mag_cam-1619360944060282-5263443969728-26228-012-66-1340-36-36.jpeg,0.0,366,20.273373,23.194526,0.874059,0.164243,0.485821,0.940874,39932.465653,2021-04-25 17:29:04.060282,2021-04-25T17:29:04.060282,Small_disk,F:\SPC_on_SSD_1\Processed\For Eric\2021 Apr 25...,processed-2021-04-25-13-01-37.044783560
low_mag_cam-1619360944060282-5263443969728-26228-012-66-1340-36-36.jpeg,0.0,366,20.273373,23.194526,0.874059,0.164243,0.485821,0.940874,39932.465653,2021-04-25 17:29:04.060282,2021-04-25T17:29:04.060282,Spherical-small,F:\SPC_on_SSD_1\Processed\For Eric\2021 Apr 25...,processed-2021-04-25-13-01-37.044783560
low_mag_cam-1619360944060282-5263443969728-26228-012-66-1340-36-36.jpeg,0.0,366,20.273373,23.194526,0.874059,0.164243,0.485821,0.940874,39932.465653,2021-04-25 17:29:04.060282,2021-04-25T17:29:04.060282,Spherical-small,F:\SPC_on_SSD_1\Processed\For Eric\2021 Apr 25...,processed-2021-04-25-13-01-37.044783560


Ok, so it was labelled as both *Small_disk* and *Spherical-small*. I think the distinction between those classes is pretty nebulous so I'm not going to worry about it too much.

Now check one of the copepod ones.

In [47]:
# show every row recorded for this image id
example_id = 'low_mag_cam-1622527748450383-9530683215456-95140-004-208-1298-60-112.jpeg'
df.loc[df.index == example_id]

Unnamed: 0,clipped_fraction,area,minor_axis_length,major_axis_length,aspect_ratio,orientation,eccentricity,solidity,estimated_volume,timestamp,timestring,concept,abspath,deployment
low_mag_cam-1622527748450383-9530683215456-95140-004-208-1298-60-112.jpeg,1.0,6720,69.272409,129.321305,0.535661,0.0,0.844433,1.0,2599437.0,2021-06-01 09:09:08.450383,2021-06-01T09:09:08.450383,copepoda,D:\SPC-proccessed-unannotated\processed-2021-0...,processed-2021-06-01-03-30-33.042807160
low_mag_cam-1622527748450383-9530683215456-95140-004-208-1298-60-112.jpeg,1.0,6720,69.272409,129.321305,0.535661,0.0,0.844433,1.0,2599437.0,2021-06-01 09:09:08.450383,2021-06-01T09:09:08.450383,copepoda,D:\SPC-proccessed-unannotated\processed-2021-0...,processed-2021-06-01-03-30-33.042807160


Ok, those seem pretty consistent. So just drop the duplicated indices and assume that the first entry is correct.

In [49]:
# keep the first row of every image id and discard the later repeats
is_repeat = df.index.duplicated(keep='first')
df = df[~is_repeat]
df.shape

(15697, 14)

Make a data frame with just the concepts, image id, absolute path, and deployment.

In [50]:
# independent copy holding only the label, source path and deployment per image id
map_columns = ['concept', 'abspath', 'deployment']
abs_path_map = df.loc[:, map_columns].copy()

In [51]:
# display the path map to check its structure
abs_path_map

Unnamed: 0,concept,abspath,deployment
low_mag_cam-1619355845046705-164385749272-730-222-66-1448-156-144.jpeg,Cnidaria,F:\SPC_on_SSD_1\Processed\For Eric\2021 Apr 25...,processed-2021-04-25-13-01-37.044783560
low_mag_cam-1619355845051241-164385749272-730-224-64-1448-164-120.jpeg,Cnidaria,F:\SPC_on_SSD_1\Processed\For Eric\2021 Apr 25...,processed-2021-04-25-13-01-37.044783560
low_mag_cam-1619355845201834-164585731240-731-103-68-1582-48-48.jpeg,Small_disk,F:\SPC_on_SSD_1\Processed\For Eric\2021 Apr 25...,processed-2021-04-25-13-01-37.044783560
low_mag_cam-1619355845202345-164585731240-731-147-1210-1342-100-96.jpeg,Small_disk,F:\SPC_on_SSD_1\Processed\For Eric\2021 Apr 25...,processed-2021-04-25-13-01-37.044783560
low_mag_cam-1619355845206732-164585731240-731-289-2084-310-76-76.jpeg,Cnidaria,F:\SPC_on_SSD_1\Processed\For Eric\2021 Apr 25...,processed-2021-04-25-13-01-37.044783560
...,...,...,...
low_mag_cam-1652110557426321-5678505638760-28313-093-532-1518-196-76.jpeg,Marine_snow_POC,D:\SPC-proccessed-unannotated\processed-2022-0...,processed-2022-05-09-14-01-33.040472616
low_mag_cam-1652110557623193-5678705621376-28314-114-460-1468-324-168.jpeg,Marine_snow_POC,D:\SPC-proccessed-unannotated\processed-2022-0...,processed-2022-05-09-14-01-33.040472616
low_mag_cam-1652110557823388-5678905604136-28315-150-450-1392-316-168.jpeg,Marine_snow_POC,D:\SPC-proccessed-unannotated\processed-2022-0...,processed-2022-05-09-14-01-33.040472616
low_mag_cam-1652110558024554-5679105572216-28316-163-532-1314-244-160.jpeg,Marine_snow_POC,D:\SPC-proccessed-unannotated\processed-2022-0...,processed-2022-05-09-14-01-33.040472616


Now save it to the directory where the images will live for safe keeping. 

In [52]:
# Write the path map next to the images. The folder is created first because
# `to_csv` does not create missing directories and would fail on a fresh machine.
map_dir = r'D:\spc_annotations_101122'
os.makedirs(map_dir, exist_ok=True)
abs_path_map.to_csv(os.path.join(map_dir, 'absolute_path_map.csv'))

Now copy all the images to the appropriate subdirectory in the new folder. 

In [67]:
# Root of the new annotation library; one sub-folder per concept is created below.
# The folder name was previously spelled '101112' here, which did not match the
# 'D:\spc_annotations_101122' directory the path map is saved to.
outfile = r'D:\spc_annotations_101122'

# image ids whose rawcolor source file could not be found
probs = []
for item, row in abs_path_map.iterrows():
    outptf = os.path.join(outfile, row['concept'])
    imgpath = row['abspath']
    outbasename = os.path.join(outptf, os.path.basename(imgpath))

    # already copied on a previous run
    if os.path.exists(outbasename):
        continue

    # Create the concept folder (and the root, if needed) outside the try block:
    # `os.mkdir` cannot create missing parents, and its FileNotFoundError used to
    # be caught below and misreported as a missing source image.
    os.makedirs(outptf, exist_ok=True)

    try:
        copy(imgpath, outbasename)
    except FileNotFoundError:
        # the source image is missing from the processed directory
        probs.append(item)

Check the missing problems to see what the deal is. 

In [68]:
# path-map rows of the images that could not be copied
is_problem = abs_path_map.index.isin(probs)
tmp = abs_path_map.loc[is_problem].copy()

In [73]:
# total number of uncopied images, then the breakdown per deployment and per concept
print(f'There are {len(tmp)} missing annotations')
for key in ('deployment', 'concept'):
    print(tmp.groupby(key)['abspath'].count())

There are 104 missing annotations
deployment
processed-2022-05-08-14-01-37.040612712    24
processed-2022-05-08-18-15-35.039450368    21
processed-2022-05-08-22-30-35.033760040    34
processed-2022-05-09-02-45-37.049924536     9
processed-2022-05-09-07-01-35.044419152     4
processed-2022-05-09-11-15-35.040794056     1
processed-2022-05-09-14-01-33.040472616    11
Name: abspath, dtype: int64
concept
Marine_snow_POC                     50
Nauplii                              1
Plankton_all_other_taxa_combined     3
Tunicates                           19
copepoda                            31
Name: abspath, dtype: int64


So we are still missing 104 annotations from `2022-05-08_Failed-30m_instead_of_meso_65m_02mab`. It looks like the `rawcolor` images were removed from the processed directory, so those runs need to be reprocessed. For the moment, just ignore them in the interest of time.

In [45]:
# NOTE(review): this cell sits below the cell that drops duplicated image ids but
# carries a lower execution count, so the output shown was presumably produced
# before the de-duplication -- confirm the intended position.
# Count the repeated rows per source image path and concept.
dup_rows = df.loc[df.index.duplicated()]
dup_rows.groupby(['abspath', 'concept']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,clipped_fraction,area,minor_axis_length,major_axis_length,aspect_ratio,orientation,eccentricity,solidity,estimated_volume,timestamp,timestring,deployment
abspath,concept,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
D:\SPC-proccessed-unannotated\processed-2021-06-01-03-30-33.042807160\webapp\low_mag_cam_rois\13521025\low_mag_cam-1622523110617221-4892851071728-48758-005-2144-1654-155-148_rawcolor.jpeg,Protozoans,1,1,1,1,1,1,1,1,1,1,1,1
D:\SPC-proccessed-unannotated\processed-2021-06-01-03-30-33.042807160\webapp\low_mag_cam_rois\13521039\low_mag_cam-1622524709187924-6491424619160-64745-009-1200-1114-40-44_rawcolor.jpeg,Diatoms_single_cells,1,1,1,1,1,1,1,1,1,1,1,1
D:\SPC-proccessed-unannotated\processed-2021-06-01-03-30-33.042807160\webapp\low_mag_cam_rois\13521039\low_mag_cam-1622524787481163-6569718445624-65528-010-168-1044-52-148_rawcolor.jpeg,copepoda,1,1,1,1,1,1,1,1,1,1,1,1
D:\SPC-proccessed-unannotated\processed-2021-06-01-03-30-33.042807160\webapp\low_mag_cam_rois\13521048\low_mag_cam-1622525760511659-7542741341448-75259-004-798-1496-40-44_rawcolor.jpeg,Diatoms_single_cells,1,1,1,1,1,1,1,1,1,1,1,1
D:\SPC-proccessed-unannotated\processed-2021-06-01-03-30-33.042807160\webapp\low_mag_cam_rois\13521064\low_mag_cam-1622527748450383-9530683215456-95140-004-208-1298-60-112_rawcolor.jpeg,copepoda,1,1,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
F:\SPC_on_SSD_1\Processed\For Eric\2021 May 09 reef 26m 0.75mab\processed-2021-05-10-21-01-32.045014688\webapp\low_mag_cam_rois\13505730\low_mag_cam-1620687718802755-7241331395568-36125-004-2034-1482-60-56_rawcolor.jpeg,Spherical-small,1,1,1,1,1,1,1,1,1,1,1,1
F:\SPC_on_SSD_1\Processed\For Eric\2021 May 09 reef 26m 0.75mab\processed-2021-05-11-03-01-31.065448040\webapp\low_mag_cam_rois\13505906\low_mag_cam-1620708801323352-6724434844672-33542-024-428-1230-68-68_rawcolor.jpeg,Spherical-small,1,1,1,1,1,1,1,1,1,1,1,1
F:\SPC_on_SSD_1\Processed\For Eric\2021 May 09 reef 26m 0.75mab\processed-2021-05-11-06-01-30.043811216\webapp\low_mag_cam_rois\13505943\low_mag_cam-1620713162206365-285611389216-1349-010-1560-612-36-32_rawcolor.jpeg,Cnidaria,1,1,1,1,1,1,1,1,1,1,1,1
F:\SPC_on_SSD_1\Processed\For Eric\2021 May 09 reef 26m 0.75mab\processed-2021-05-11-06-01-30.043811216\webapp\low_mag_cam_rois\13505951\low_mag_cam-1620714236486297-1359896567704-6721-012-352-1290-36-36_rawcolor.jpeg,Cnidaria,1,1,1,1,1,1,1,1,1,1,1,1
