## Data acquisition

In [1]:
import os
import random
import shutil
from os import listdir
from os.path import isfile, join

In [2]:
import imageio
import skimage
import skimage.io
import skimage.transform
import numpy as np
from scipy import ndarray
from sklearn.model_selection import train_test_split

In [8]:
# Show the kernel's working directory: the data paths used below are relative to it.
os.getcwd()


'/home/phil/unimib/tesi/notebooks'

In [3]:
# Source folders, one per imaging modality (relative to the working directory).
CT = '../data/CT/'
MRI = '../data/MRI/'
PET = '../data/PET/'

myPaths = [CT, MRI, PET]

# Map each modality folder to the names of the regular files it contains.
# os.listdir returns entries in arbitrary order, so the names are sorted to
# give every later step the same input order on every run.
myDict = {}
for path in myPaths:
    myDict[path] = sorted(f for f in listdir(path) if isfile(join(path, f)))

### Split data into train, validation and test datasets

In [51]:
# Settings tried so far (presumably train_size, val_size pairs - confirm):
#   0.7, 0.2 without augmentation
#   0.4, 0.4 with augmentation (j=2)
#   0.3, 0.4 with j=3
j = 2  # scale of augmentation
train_size = 0.5  # fraction of each modality assigned to train
val_size = 0.7  # fraction of the held-out remainder assigned to validation
SPLIT_SEED = 42  # fixed seed so re-running the cell reproduces the same split

# First split, per modality: train vs. held-out images.
CT_train, CT_test = train_test_split(
    myDict[myPaths[0]], train_size=train_size, random_state=SPLIT_SEED)
MRI_train, MRI_test = train_test_split(
    myDict[myPaths[1]], train_size=train_size, random_state=SPLIT_SEED)
PET_train, PET_test = train_test_split(
    myDict[myPaths[2]], train_size=train_size, random_state=SPLIT_SEED)

# Second split: the held-out images are divided into validation and test.
CT_val, CT_test = train_test_split(
    CT_test, train_size=val_size, random_state=SPLIT_SEED)
MRI_val, MRI_test = train_test_split(
    MRI_test, train_size=val_size, random_state=SPLIT_SEED)
PET_val, PET_test = train_test_split(
    PET_test, train_size=val_size, random_state=SPLIT_SEED)

# Merge the three modalities into one list of file names per split.
train_file_names = CT_train + MRI_train + PET_train
val_file_names = CT_val + MRI_val + PET_val
test_file_names = CT_test + MRI_test + PET_test

# Report the size of each split and its share of the whole dataset.
lenTot = len(train_file_names)+len(val_file_names)+len(test_file_names)
percTrain = (len(train_file_names)/lenTot)*100
percVal = (len(val_file_names)/lenTot)*100
percTest = (len(test_file_names)/lenTot)*100
print('Number of train images:', len(
    train_file_names), '= %.0f' % percTrain, '%')
print('Number of validation images:', len(
    val_file_names), '= %.0f' % percVal, '%')
print('Number of test images:', len(test_file_names), '= %.0f' % percTest, '%')

Number of train images: 2124 = 50 %
Number of validation images: 1486 = 35 %
Number of test images: 638 = 15 %


### Make data/ and copy images into train/, val/ and test/

In [52]:
print(os.getcwd())

# Remove any data/ tree left over from a previous run so the copy starts clean.
data_dir = 'data/'
if os.path.exists(data_dir):
    print('Deleting', data_dir)
    # Errors from rmtree propagate on purpose: a partially deleted tree must
    # not go unnoticed.
    shutil.rmtree(data_dir)
    # Only reported when something was actually deleted.
    print('Delete complete')

/home/phil/unimib/tesi/src
Deleting data/
Delete complete


In [None]:
# Announce which folder is about to be created and under which working directory.
current_dir = os.getcwd()
print('Creating', data_dir, 'here:', current_dir)

In [53]:
# Destination folders for the three splits, in the order train / validation / test.
directories = ['data/train/',
               'data/val/',
               'data/test/']

# exist_ok=True keeps this cell re-runnable when the folders already exist.
for directory in directories:
    os.makedirs(directory, exist_ok=True)

Creating data/ here: /home/phil/unimib/tesi/src


In [54]:
# One list of file names per split, in the same order as `directories`
# (train, validation, test), so that the two lists can be zipped together.
_file_names = [train_file_names, val_file_names, test_file_names]


def copyImages(srcs, _file_names, directory):
    """Copy the listed files from whichever source folders contain them.

    Every file name is looked up in every folder of ``srcs``. Combinations
    that do not exist on disk are skipped, since a file is normally present
    in only one of the source folders.

    Args:
        srcs: iterable of source folder paths.
        _file_names: iterable of file names (not full paths) to copy.
        directory: destination handed to ``shutil.copy``.

    Raises:
        OSError: if copying an existing source file fails (for example an
            unwritable or missing destination). Such failures are no longer
            silently ignored.
    """
    for source in srcs:
        for f in _file_names:
            source_path = os.path.join(source, f)
            # Skip names that are not in this source folder rather than
            # relying on a blanket `except OSError`, which also hid real
            # copy failures.
            if not os.path.isfile(source_path):
                continue
            # NOTE: a name present in several source folders is copied from
            # each of them; the later copy overwrites the earlier one.
            shutil.copy(source_path, directory)
    print('Copied successfully.')


# Copy each split's files from the source folders into that split's directory.
for name, directory in zip(_file_names, directories):
    copyImages(myPaths, name, directory)

Copied successfully.
Copied successfully.
Copied successfully.


In [55]:
# Sanity check: every split folder must contain exactly as many files as
# were assigned to that split.
struct = ['Train', 'Validation', 'Test']
for DIR, name, s in zip(directories, _file_names, struct):
    files_on_disk = sum(
        1 for entry in os.listdir(DIR)
        if os.path.isfile(os.path.join(DIR, entry)))
    if files_on_disk == len(name):
        print(s, 'OK.')
    else:
        print('ERROR, i numberi non combaciano in ', s)

Train OK.
Validation OK.
Test OK.


## Data augmentation on train

In [56]:
def random_rotation(image_array: ndarray):
    """Rotate an image by a random angle.

    The angle is drawn uniformly from [-25, 25] and passed to
    ``skimage.transform.rotate`` as the rotation angle (in degrees).

    Args:
        image_array: image to rotate.

    Returns:
        Whatever ``skimage.transform.rotate`` returns for the image
        (presumably a float image - confirm against the skimage docs).
    """
    random_degree = random.uniform(-25, 25)
    # The notebook imports `skimage`; the alias `sk` is never defined, so the
    # call must go through the full module name.
    return skimage.transform.rotate(image_array, random_degree)


def horizontal_flip(image_array: ndarray):
    """Mirror the image left-to-right by reversing its second axis."""
    reversed_columns = slice(None, None, -1)
    return image_array[:, reversed_columns]


def vertical_flip(image_array: ndarray):
    """Mirror the image top-to-bottom by reversing its first axis."""
    reversed_rows = slice(None, None, -1)
    return image_array[reversed_rows, :]

In [57]:
from tqdm import tqdm

# Folder holding the training images to augment.
folder_path = 'data/train'

# Number of augmented files to generate: j times the number of training images.
num_files_desired = (len(train_file_names))*j

# List the files once, before the loop, so that images written by the loop
# are never picked as inputs themselves.
images = [os.path.join(folder_path, f) for f in os.listdir(
    folder_path) if os.path.isfile(os.path.join(folder_path, f))]

available_transformations = {
    'rotate': random_rotation,
    'horizontal_flip': horizontal_flip,
    'vertical_flip': vertical_flip
}

for i in tqdm(range(num_files_desired)):
    # Pick a random source image and read it as an array of pixels.
    image_path = random.choice(images)
    image_to_transform = skimage.io.imread(image_path)

    # Apply exactly one randomly chosen transformation.
    key = random.choice(list(available_transformations))
    transformed_image = available_transformations[key](image_to_transform)

    # A transformation may return a floating-point image; convert it to uint8
    # explicitly instead of leaving the conversion to the image writer.
    # NOTE(review): img_as_ubyte expects float values in [-1, 1] - assumes the
    # source images are integer-typed so rotated output stays in range; confirm.
    if np.issubdtype(transformed_image.dtype, np.floating):
        transformed_image = skimage.img_as_ubyte(transformed_image)

    # Build the output name from the source path minus its extension.
    # splitext only strips the final extension, so names containing other
    # dots are kept intact.
    source_stem, _ = os.path.splitext(image_path)
    new_file_path = '%s_augmented_image_%s.png' % (source_stem, i)

    # Write the augmented image next to its source in the train folder.
    imageio.imwrite(new_file_path, transformed_image)















































100%|██████████| 4248/4248 [00:46<00:00, 91.11it/s]


In [58]:
# Re-read the train folder so the list also contains the augmented images.
train_dir = directories[0]
train_file_names = [
    entry for entry in listdir(train_dir) if isfile(join(train_dir, entry))]

# Size of each split and its share of the whole dataset.
split_sizes = [len(train_file_names), len(val_file_names), len(test_file_names)]
lenTot = split_sizes[0] + split_sizes[1] + split_sizes[2]
percTrain, percVal, percTest = [
    (size / lenTot) * 100 for size in split_sizes]

report_rows = zip(
    ('Number of train images:',
     'Number of validation images:',
     'Number of test images:'),
    split_sizes,
    (percTrain, percVal, percTest))
for label, size, percentage in report_rows:
    print(label, size, '= %.0f' % percentage, '%')

Number of train images: 6372 = 75 %
Number of validation images: 1486 = 17 %
Number of test images: 638 = 8 %
