# Prepare data distribution and folders to be used with our DAMT-Net script


In [37]:
#### CHANGE THIS and run all ####

# Input data directory: it is expected to contain one sub-folder per entry
# in `datasets`.
source_path = "./mitochondria/"
# Dataset folder names to look for inside `source_path`.
datasets = ['Lucchi++', 'VNC', 'Kasthuri++']

# Output data directory: the converted images and the generated splits are
# written here, one sub-folder per dataset.
destination_path = './datasets/mitochondria/'


Input data layout:

Each dataset folder inside the input data dir (shown below as `data/`; for example, `data` is `VNC`) must be organized as follows:

```
    data/
        |-- train/
        |    |-- x/
        |    |      training-0001.tif
        |    |      ...
        |    |-- y/
        |    |      training_groundtruth-0001.tif
        |    |        ...
        |-- test/
        |    |-- x/
        |    |      testing-0001.tif
        |    |      ...
        |    |-- y/
        |    |      testing_groundtruth-0001.tif
        |    |      ...
```

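Output data layout (created in the output data dir for each dataset; images are re-saved as PNG, so the `.tif` names below only illustrate the folder structure):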

```
    data/
        |-- train/
        |    |-- file_list.txt
        |    |-- x/
        |    |      training-0001.tif
        |    |      ...
        |    |-- y/
        |    |      training_groundtruth-0001.tif
        |    |        ...
        |-- test/
        |    ...
        |-- train_val/
        |    ...
        |-- train_val_test/
        |    ...
        |-- val/
        |    ...

```

## Run all

### Functions

In [38]:
import numpy as np
import os

def set_seed(seedValue=42):
  """Seed the random number generators to obtain results as reproducible
  as possible.

  Seeds Python's built-in ``random`` module and NumPy's global generator,
  and stores the seed in the ``PYTHONHASHSEED`` environment variable.

  Args:
    seedValue (int, optional): seed value. Defaults to 42.
  """
  # Imported locally so the function does not depend on an import cell
  # that may not bring `random` into scope.
  import random

  random.seed(seedValue)
  np.random.seed(seed=seedValue)
  # PYTHONHASHSEED is read when the interpreter starts, so setting it here
  # does not alter hashing in the running kernel; only processes launched
  # afterwards inherit it.
  os.environ["PYTHONHASHSEED"]=str(seedValue)
set_seed()

In [39]:
from PIL import Image
import os
from skimage.util import img_as_ubyte
from skimage import io
from glob import glob
import numpy as np

def get_xy_image_list(dir):
    """Load the images in ``dir/x`` and the labels in ``dir/y``.

    Files are selected with the pattern ``*.*`` and each list is sorted by
    path, so images and labels are paired by their sorted position.

    Args:
        dir (str): folder holding the ``x`` (images) and ``y`` (labels)
            sub-folders. A trailing slash is accepted.

    Returns:
        tuple: ``(images, labels, names)``. ``images`` and ``labels`` are
        lists of uint8 arrays; ``names`` holds the label file names without
        extension, in the same sorted order.

    Raises:
        ValueError: if the number of image files differs from the number of
            label files.
    """
    # os.path.join copes with a trailing separator (and with an empty
    # string), so no manual slash stripping is required.
    input_paths = sorted(glob(os.path.join(dir, 'x', '*.*')))
    label_paths = sorted(glob(os.path.join(dir, 'y', '*.*')))

    # The returned names are derived from the label files but callers apply
    # them to the images as well, so both lists must line up one-to-one.
    # Checked before any file is read so a broken folder fails fast.
    if len(input_paths) != len(label_paths):
        raise ValueError(
            "Different number of images and labels in '" + str(dir) + "' x:"
            + str(len(input_paths)) + ", y:" + str(len(label_paths)))

    # NOTE(review): the explicit uint8 cast runs before img_as_ubyte, so
    # img_as_ubyte always receives uint8 data. Inputs stored with another
    # dtype would be cast without rescaling -- confirm all sources are 8-bit.
    images = [img_as_ubyte(np.array(io.imread(p), dtype='uint8')) for p in input_paths]
    labels = [img_as_ubyte(np.array(io.imread(p), dtype='uint8')) for p in label_paths]

    names = [os.path.splitext(os.path.basename(p))[0] for p in label_paths]

    return images, labels, names

def save_images(imgs, dst_path, name_prefix, fnames, format='.png', convert=''):
    """Write every array in ``imgs`` to ``dst_path`` as an image file.

    The i-th array is stored as ``fnames[i] + name_prefix + format``. Note
    that, despite its name, ``name_prefix`` is appended after the base name.

    Args:
        imgs: sequence of arrays accepted by ``Image.fromarray``.
        dst_path (str): existing folder that receives the files.
        name_prefix (str): text appended to each base name.
        fnames: base file names, indexed in parallel with ``imgs``.
        format (str, optional): file extension, including the dot.
        convert (str, optional): mode passed to ``Image.convert``; the
            empty string (default) skips the conversion.
    """
    for idx, array in enumerate(imgs):
        picture = Image.fromarray(array)
        picture = picture if convert == '' else picture.convert(convert)
        target = os.path.join(dst_path, fnames[idx] + name_prefix + format)
        picture.save(target, quality=100)

def create_dir(dir):
    """Create the folder ``dir``, including missing parents, if needed.

    Calling it on a folder that already exists is a no-op.

    Args:
        dir (str): path of the folder to create.

    Raises:
        FileExistsError: if ``dir`` exists but is not a directory.
    """
    # exist_ok=True replaces the separate exists() check, which could race
    # with another process creating the folder between check and creation.
    os.makedirs(dir, exist_ok=True)

### Create main sets

In [40]:
create_dir(destination_path)

# Convert every dataset: read its train and test folders and re-save the
# images and masks as PNG under destination_path/<dataset>/<fold>/{x,y}.
for ds_name in datasets:
    direc = os.path.join(source_path, ds_name)

    print(ds_name)

    # Both folds are processed in exactly the same way, so a single loop
    # replaces the two copy-pasted sections.
    for fold in ("train", "test"):
        print(" " + fold)
        fold_img, fold_lbl, fold_fnames = get_xy_image_list(os.path.join(direc, fold))

        out_img_path = os.path.join(destination_path, ds_name, fold, "x")
        out_mask_path = os.path.join(destination_path, ds_name, fold, "y")
        create_dir(out_img_path)
        create_dir(out_mask_path)
        # Images and masks are saved with the same names: the names returned
        # by get_xy_image_list followed by the dataset name.
        save_images(fold_img, out_img_path, ds_name, fold_fnames, format='.png')
        save_images(fold_lbl, out_mask_path, ds_name, fold_fnames, format='.png')

Lucchi++
 train
 test
VNC
 train
 test
Kasthuri++
 train
 test


### Make splits

In [41]:
from glob import glob
from sklearn.model_selection import train_test_split
import shutil
import os
def train_val_split(train_path, val_path, size = 0.1):
    """Move a random subset of the training pairs into a validation folder.

    Before anything is moved, the whole training folder is copied to a
    sibling folder named ``<train_path>_val``, which therefore keeps every
    original training pair. The selected pairs are then moved from
    ``train_path/{x,y}`` to ``val_path/{x,y}``.

    Images and masks are paired by their sorted position.

    Args:
        train_path (str): folder with the ``x`` and ``y`` sub-folders.
            A trailing separator is accepted.
        val_path (str): folder that receives the validation pairs; its
            ``x`` and ``y`` sub-folders are created if missing.
        size (float, optional): forwarded to ``train_test_split`` as
            ``test_size``. Defaults to 0.1.

    Raises:
        AssertionError: if there are no images, or if the number of images
            and masks differ.
        FileExistsError: raised by ``shutil.copytree`` when the
            ``<train_path>_val`` folder already exists.
    """
    create_dir(os.path.join(val_path,'x'))
    create_dir(os.path.join(val_path,'y'))

    x_set = sorted(glob(os.path.join(train_path,"x/*")))
    y_set = sorted(glob(os.path.join(train_path,"y/*")))

    assert len(x_set) != 0, "There are no images"  
    assert len(x_set) == len(y_set), "There is different ammount of images and masks x:" + str(len(x_set)) + ", y:" + str(len(y_set))

    # Split positions rather than paths so the same selection is applied to
    # both the image list and the mask list.
    _, val_img_indices = train_test_split(
        list(range(len(x_set))),
        test_size=size,
        random_state=42)

    # Strip trailing separators first: otherwise "train/" + "_val" would
    # name a folder *inside* the training folder instead of next to it.
    train_val_path = train_path.rstrip("/" + os.sep) + "_val"
    shutil.copytree(train_path, train_val_path)

    for i in val_img_indices:
        x = x_set[i]
        y = y_set[i]
        shutil.move(x, os.path.join(val_path, 'x', os.path.basename(x)))
        shutil.move(y, os.path.join(val_path, 'y', os.path.basename(y)))

# Split the training folder of every dataset; size 0.1 == 10% goes to "val".
for ds in datasets:
    print(ds)
    ds_root = os.path.join(destination_path, ds)
    train_val_split(os.path.join(ds_root, "train"), os.path.join(ds_root, "val"), size=0.1)

Lucchi++
VNC
Kasthuri++


In [42]:
from glob import glob
import shutil
import os

def combine_folders(train_path, test_path, out_path):
    """Build ``out_path`` from a copy of ``train_path`` plus the test pairs.

    ``train_path`` is copied as a whole to ``out_path``; afterwards every
    file of ``test_path/x`` and ``test_path/y`` is copied into the matching
    sub-folder of ``out_path`` with ``test_`` prepended to its name.

    Args:
        train_path (str): folder copied to ``out_path``.
        test_path (str): folder with ``x`` and ``y`` sub-folders to add.
        out_path (str): destination folder; ``shutil.copytree`` creates it.

    Raises:
        AssertionError: if the test folder has no images, or if its numbers
            of images and masks differ.
    """
    shutil.copytree(train_path, out_path)

    test_images = sorted(glob(os.path.join(test_path,"x/*")))
    test_masks = sorted(glob(os.path.join(test_path,"y/*")))
    assert len(test_images) != 0, "There are no images in Test"  
    assert len(test_images) == len(test_masks), "There is different ammount of images on test and masks x:" + str(len(test_images)) + ", y:" + str(len(test_masks))

    # Image and mask are matched by their position in the sorted lists.
    for idx, image_file in enumerate(test_images):
        mask_file = test_masks[idx]
        image_target = os.path.join(out_path, 'x', 'test_' + os.path.basename(image_file))
        shutil.copy(image_file, image_target)
        mask_target = os.path.join(out_path, 'y', 'test_' + os.path.basename(mask_file))
        shutil.copy(mask_file, mask_target)

# For every dataset, build "train_val_test" from "train_val" plus "test".
for ds in datasets:
    print(ds)
    ds_root = os.path.join(destination_path, ds)
    combine_folders(
        os.path.join(ds_root, "train_val"),
        os.path.join(ds_root, "test"),
        os.path.join(ds_root, "train_val_test"),
    )

Lucchi++
VNC
Kasthuri++


In [43]:
from glob import glob
import os

def equal_rename_xy(path):
    """Give the files in ``path/y`` the same names as the files in ``path/x``.

    Both folders are listed and sorted; the i-th file of ``y`` is renamed
    to the base name of the i-th file of ``x``. Pairing stops at the end of
    the shorter listing, so surplus files on either side are left alone.

    Args:
        path (str): folder containing the ``x`` and ``y`` sub-folders, with
            or without a trailing separator.

    Returns:
        list: base names of the ``x`` files that were paired, in sorted
        order.
    """
    # os.path.join instead of string concatenation: without it a path
    # lacking a trailing separator would silently match nothing.
    x_paths = sorted(glob(os.path.join(path, "x", "*")))
    y_paths = sorted(glob(os.path.join(path, "y", "*")))

    filenames = []
    for x_path, y_path in zip(x_paths, y_paths):
        x_name = os.path.basename(x_path)
        target = os.path.join(os.path.dirname(y_path), x_name)
        # Skip the rename when the mask already carries the right name.
        if y_path != target:
            os.rename(y_path, target)
        filenames.append(x_name)
    return filenames

# Process every <dataset>/<split>/ folder under the destination: align the
# mask names with the image names and create a file_list.txt in the folder.
split_dirs = glob(destination_path + "*/*/")
# The loop variable is not called `dir`, so the builtin dir() is not
# rebound at notebook scope for the rest of the session.
for split_dir in split_dirs:
    print(split_dir)
    names = equal_rename_xy(split_dir)
    # Mode "w" creates (or truncates) the file; the context manager closes
    # it even if an error occurs.
    with open(os.path.join(split_dir, "file_list.txt"), "w") as f:
        # NOTE(review): writing `names` is disabled, so file_list.txt is
        # left empty -- confirm this is intended before re-enabling it.
        #f.write('\n'.join(names) + '\n')
        pass

./datasets/mitochondria\Kasthuri++\test\
./datasets/mitochondria\Kasthuri++\train\
./datasets/mitochondria\Kasthuri++\train_val\
./datasets/mitochondria\Kasthuri++\train_val_test\
./datasets/mitochondria\Kasthuri++\val\
./datasets/mitochondria\Lucchi++\test\
./datasets/mitochondria\Lucchi++\train\
./datasets/mitochondria\Lucchi++\train_val\
./datasets/mitochondria\Lucchi++\train_val_test\
./datasets/mitochondria\Lucchi++\val\
./datasets/mitochondria\VNC\test\
./datasets/mitochondria\VNC\train\
./datasets/mitochondria\VNC\train_val\
./datasets/mitochondria\VNC\train_val_test\
./datasets/mitochondria\VNC\val\
