# Generate XML annotation files and JPEGImages from TIFF files.

* This notebook assumes that datasets have been stored locally on this machine and bounding regions annotated within Fiji/ImageJ.
* If this is not the case, then you may want to use the OMERO version of this script instead:
dset01_create_anno_from_OMERO.ipynb  (This other notebook allows you to import ROI from tiff files directly).  
* Creating a dataset forms the foundation of training material used to train one of many object detection algorithms.  
* N.B. Once finished here, it is also important to add the information about this dataset to the config/dataset_spec.txt file.  
* N.B. The next step after this is to run the notebook dset02_create_anno_formats. That notebook is used to create the
configuration files associated with the dataset and also allows you to group datasets together for larger training.

In [1]:
import getpass
import matplotlib.pylab as plt
import os
import numpy as np
from scipy import ndimage
import ijroi
from ijroi.ij_roi import Roi
from ijroi.ijpython_decoder import decode_ij_roi
import sys
sys.path.append('../src')
import convert_voc_to_other as cvto

### Creating Folder structure for new dataset
First we need to create a folder structure on the file-system.

In [11]:
# Create the folder structure for the new dataset.
# Resulting layout:
#   <dataset_home_dir>/<dataset_name>/<year_acquisition>/Annotations
#   <dataset_home_dir>/<dataset_name>/<year_acquisition>/JPEGImages
dataset_home_dir = "/Users/dominicwaithe/Documents/collaborators/WaitheD/micro_vision/cell_datasets/"
dataset_name = "cho_rfp_pcna_class"  # e.g. erythroblast_dapi_glycophorinA_FOXO3_class
year_acquisition = "2020"
class_name = "cell - cho rfp pcna"  # Classes to have in this dataset.
###########
## The above has to be added to config/dataset_spec.txt file.
###########

xml_path = dataset_home_dir+dataset_name+'/'+year_acquisition+'/Annotations'
jpg_path = dataset_home_dir+dataset_name+'/'+year_acquisition+'/JPEGImages'

# os.makedirs creates every missing parent in one call (including
# dataset_home_dir itself), and exist_ok=True makes re-running this cell a
# no-op instead of relying on a separate exists() check before each mkdir.
for dirm in [xml_path, jpg_path]:
    os.makedirs(dirm, exist_ok=True)


### Specify folder containing the tiff files.
and collect those files with ij_metadata (rather than OME metadata).

In [12]:

from tifffile import TiffFile


dir_to_read = "/Users/dominicwaithe/Documents/collaborators/WaitheD/micro_vision/acquisitions/2020_11_13_training_imgs/2020_11_13_training_imgs_highDenFixed/"

# Directory listing in lexicographic order. This matches numeric order only
# when the file names are zero-padded (e.g. pos00001.tif, pos00002.tif, ...).
dirFiles = sorted(os.listdir(dir_to_read))

# Keep only the TIFF files (case-sensitive match on the extension).
myimages = [fname for fname in dirFiles if fname.endswith(('.tiff', '.tif'))]
print(len(myimages))
print(myimages)

# Split the TIFF files by the kind of metadata they carry. OME metadata takes
# precedence: a file exposing both is classed as OME.
ij_tiff = []
ome_tiff = []
for tiff in myimages:
    img_to_open = dir_to_read+tiff
    # The context manager closes the file handle even if reading the
    # metadata raises.
    with TiffFile(img_to_open) as tf_img:
        if tf_img.ome_metadata is not None:
            print('I think this is an OME tiff.')
            ome_tiff.append(tiff)
        elif tf_img.imagej_metadata is not None:
            print('I think this is an IJ tiff.')
            ij_tiff.append(tiff)
        else:
            # Report files that are skipped so they do not vanish silently.
            print('No OME or IJ metadata found, skipping:', tiff)

103
['pos00001.tif', 'pos00002.tif', 'pos00003.tif', 'pos00004.tif', 'pos00005.tif', 'pos00006.tif', 'pos00007.tif', 'pos00008.tif', 'pos00009.tif', 'pos00010.tif', 'pos00011.tif', 'pos00012.tif', 'pos00013.tif', 'pos00014.tif', 'pos00015.tif', 'pos00016.tif', 'pos00017.tif', 'pos00018.tif', 'pos00019.tif', 'pos00020.tif', 'pos00021.tif', 'pos00022.tif', 'pos00023.tif', 'pos00024.tif', 'pos00025.tif', 'pos00026.tif', 'pos00027.tif', 'pos00028.tif', 'pos00029.tif', 'pos00030.tif', 'pos00031.tif', 'pos00032.tif', 'pos00033.tif', 'pos00034.tif', 'pos00035.tif', 'pos00036.tif', 'pos00037.tif', 'pos00038.tif', 'pos00039.tif', 'pos00040.tif', 'pos00041.tif', 'pos00042.tif', 'pos00043.tif', 'pos00044.tif', 'pos00045.tif', 'pos00046.tif', 'pos00047.tif', 'pos00048.tif', 'pos00049.tif', 'pos00050.tif', 'pos00051.tif', 'pos00052.tif', 'pos00053.tif', 'pos00054.tif', 'pos00055.tif', 'pos00056.tif', 'pos00057.tif', 'pos00058.tif', 'pos00059.tif', 'pos00060.tif', 'pos00061.tif', 'pos00062.tif', 'po

### Generate XML annotation files and JPEGImages from ImageJ Tifffiles.
This cell takes a folder of ImageJ tiff images and annotations located on the local computer.
This script assumes that images have been annotated in Fiji/ImageJ and the cell class has been used to label the ROI.


In [13]:
# If you want to name your output images with a sequence (recommended), then set rename_with_seq=True
rename_with_seq = True
start_index = 2000  # Choose a sensible unique start.
# If rename_with_seq = False, then the input name will be reused, with .jpg ending.
annotator_name = "Waithe"
override = True
scale_factor = 0.5  # Zoom factor applied to the image before saving.
ch_to_take = 2  # Index along the first axis used when the image has more than 2 dimensions.
sat_fac = 0.3  # Matches Fiji/ImageJ saturation factor of 0.3%


ct = 0
for tiff in ij_tiff:
    if rename_with_seq:
        output_name = str(start_index+ct).zfill(6)
    else:
        # splitext removes only the final extension, so a name such as
        # "a.b.tif" becomes "a.b" rather than being cut at the first dot.
        output_name = os.path.splitext(tiff)[0]

    print('input name:',tiff,'output name: '+str(output_name)+'.jpg')

    # Decode the pixel data once and release the file handle straight away.
    with TiffFile(dir_to_read+tiff) as tfile:
        raw_img = tfile.asarray()
        ij_metadata = tfile.imagej_metadata or {}
    img_shape = raw_img.shape

    # The 'Overlays' entry is either a single encoded ROI or a list of them.
    if 'Overlays' in ij_metadata:
        overlays = ij_metadata['Overlays']
        if not isinstance(overlays, (list, tuple)):
            overlays = [overlays]
        overlay_arr = [decode_ij_roi(overlay, img_shape) for overlay in overlays]
    else:
        print('no Overlays present in file.')
        overlay_arr = []

    # Collect [x, y, width, height, class name] for every overlay that decoded
    # to a truthy ROI object; NUL characters are stripped from the name.
    roi_list = []
    for roi in overlay_arr:
        if roi:
            roi_class_name = str(roi.name).replace("\x00", "")
            roi_list.append([roi.x, roi.y, roi.width, roi.height, roi_class_name])

    # Reduce multi-dimensional stacks to a single 2-D plane.
    if raw_img.ndim > 2:
        raw_img = raw_img[ch_to_take,:,:]
    print(raw_img.shape)

    if raw_img.ndim != 2:
        raise ValueError("image should only have 2-dimensions.")

    # Intensity bounds that clip sat_fac/2 percent of the pixels at each end.
    sorted_img = np.sort(raw_img, axis=None)
    n_pixels = sorted_img.shape[0]
    img_min = min(int(np.ceil(n_pixels*((sat_fac/2.)/100.))), n_pixels - 1)
    img_max = int(np.floor(n_pixels*((100.-(sat_fac/2.))/100.)))

    # Convert to plain Python ints: arithmetic between a Python int such as
    # 2**16 and a NumPy uint16 scalar overflows under NumPy 2 promotion rules.
    lower_bound = int(sorted_img[img_min])
    upper_bound = int(sorted_img[img_max])

    # This is very similar to the ImageJ/Fiji methodology when saving JPEGs but isn't exactly the same.
    # 65536-entry lookup table: 0 below lower_bound, a linear ramp between the
    # bounds, 255 from upper_bound upwards.
    # NOTE(review): this assumes non-negative integer pixel values below 2**16 - confirm for new data.
    lut = np.concatenate([
            np.zeros(lower_bound, dtype=np.uint8),
            np.linspace(0, 255, upper_bound - lower_bound).astype(np.uint8),
            np.full(2**16 - upper_bound, 255, dtype=np.uint8)
        ])

    bit_img = lut[raw_img]
    # ndimage.zoom is the public name; the ndimage.interpolation namespace is deprecated.
    corr_img = ndimage.zoom(bit_img, scale_factor)

    # Replicate the grey plane into three identical channels.
    out_img = np.stack((corr_img, corr_img, corr_img), axis=-1).astype(np.uint8)

    jpg_file = str(output_name)+".jpg"
    # Save the JPEG image out to the folder
    plt.imsave(jpg_path+'/'+jpg_file, out_img)
    # Save the XML annotation out.
    cvto.write_xml(xml_path, roi_list, output_name, dataset_name, class_name,override, year_acquisition, out_img.shape[1], out_img.shape[0], scale_factor)
    ct+=1

input name: pos00001.tif output name: 002000.jpg
(1024, 1024)
input name: pos00002.tif output name: 002001.jpg
(1024, 1024)
input name: pos00003.tif output name: 002002.jpg
(1024, 1024)
input name: pos00004.tif output name: 002003.jpg
(1024, 1024)
input name: pos00005.tif output name: 002004.jpg
(1024, 1024)
input name: pos00006.tif output name: 002005.jpg
(1024, 1024)
input name: pos00007.tif output name: 002006.jpg
(1024, 1024)
input name: pos00008.tif output name: 002007.jpg
(1024, 1024)
input name: pos00009.tif output name: 002008.jpg
(1024, 1024)
input name: pos00010.tif output name: 002009.jpg
(1024, 1024)
input name: pos00011.tif output name: 002010.jpg
(1024, 1024)
input name: pos00012.tif output name: 002011.jpg
(1024, 1024)
input name: pos00013.tif output name: 002012.jpg
(1024, 1024)
input name: pos00014.tif output name: 002013.jpg
(1024, 1024)
input name: pos00015.tif output name: 002014.jpg
(1024, 1024)
input name: pos00016.tif output name: 002015.jpg
(1024, 1024)
input na