## TFR preprocessing
The goal of this Notebook is to convert all the 3D images that have been preprocessed and saved as numpy files in the previous Notebook (2_MRI_preprocessing) to TFRecords.

According to the [TensorFlow documentation](https://www.tensorflow.org/tutorials/load_data/tfrecord), the TFRecord format stores data as a sequence of binary records. The main advantage of using TFRecords is that it speeds up data reading.

This notebook is structured as follows:
   - Initial setup
   - Import libraries
   - Load features
   - Load labels
   - Create TFRecords

### Initial setup

#### Google Drive

In [1]:
# Specify if user is working on Google Drive
google_drive = False

In [2]:
import sys

if google_drive:
    # Google Colab set-up: mount Drive so the project folder becomes reachable
    from google.colab import drive
    drive.mount('/content/drive')

    path = "./drive/MyDrive/TFM/Code/"
else:
    # Local set-up: the project folder is taken to be one level above the
    # current working directory
    path = "../"

# Register the project folder for imports once; the membership test keeps
# sys.path from growing every time this cell is re-run
if path not in sys.path:
    sys.path.append(path)

### Import libraries

In [3]:
import os
import random
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import tqdm
import tensorflow as tf
import glob

### Load features

#### Load 3D images filenames (features)

In [4]:
# Folder that holds the preprocessed 3D images in NumPy's compressed format (.npz)
root_3d_volumes = path + "Datasets/Volume_files/"

In [5]:
# Get the list of .npz files in the 3D volumes folder.
# os.listdir returns entries in arbitrary order, so the names are sorted to
# give a starting order that does not depend on the filesystem. Keeping only
# the ".npz" extension also leaves out stray files such as ".DS_Store".
filenames = sorted(file for file in os.listdir(root_3d_volumes) if file.endswith(".npz"))

# Prepend the folder so that every entry is a path that can be opened directly
filenames = [root_3d_volumes + file for file in filenames]

# Check number of images loaded
print("[+] Number of 3D volumes:", len(filenames))

[+] Number of 3D volumes: 1085


In [6]:
# Shuffle the list of filenames in place with a dedicated, seeded generator:
# for a given input order the result is the same on every run, and the
# global `random` state is left untouched
random.Random(7).shuffle(filenames)

#### Split dataset into training, validation and testing

In [7]:
# Target fractions for the first split: 70% training, 30% held out
# (the held-out part is halved into validation and testing further down)
train_size = 0.7
test_size = 0.3

In [8]:
# First split: training set vs. held-out set. The held-out fraction is read
# from `test_size` (defined in the previous cell) instead of a repeated literal,
# so editing the configuration actually changes the split.
# NOTE(review): the split is made per image file. If a subject contributes
# several scans, scans of one subject may end up in different sets -- confirm
# whether a subject-level split is required.
train_filenames, test_filenames = train_test_split(filenames,
                                                   test_size = test_size,
                                                   random_state = 7)

In [9]:
# Second split: halve the held-out set into testing and validation.
# NOTE: `test_filenames` is both the input and an output of this call, so
# re-running this cell on its own halves the testing set again; re-run the
# previous split cell first.
test_filenames, val_filenames = train_test_split(test_filenames, 
                                             test_size = 0.5, 
                                             random_state = 7)

In [10]:
# Report how many files ended up in each split
split_sizes = {
    "[+] Training size:": len(train_filenames),
    "[+] Validation size:": len(val_filenames),
    "[+] Testing size:": len(test_filenames),
}
for caption, n_files in split_sizes.items():
    print(caption, n_files)

[+] Training size: 759
[+] Validation size: 163
[+] Testing size: 163


#### Load array for each 3D volume (features)

In [1]:
def return_volumes(filenames, crop = (slice(10, 120), slice(30, 160), slice(15, 95))):
    """Load the 3D volumes stored in .npz files and stack them in one array.

    Args:
        filenames: paths of the .npz archives to read. Each archive must hold
            the volume under the key 'arr_0'.
        crop: one slice per axis selecting the region kept from every volume.
            The default keeps [10:120, 30:160, 15:95], the crop that used to
            be hard-coded.

    Returns:
        A numpy array with the cropped volumes stacked along a new first axis,
        in the same order as `filenames`.
    """

    # Accept any sequence of slices; indexing needs a tuple
    crop = tuple(crop)

    # List where to save the 3D volumes
    volumes = []

    # Load 3D images from filenames list
    for file in filenames:

        # NOTE(review): allow_pickle=True lets np.load unpickle objects found in
        # the archive, which can run arbitrary code. Only load trusted files;
        # if the volumes are plain numeric arrays the flag can likely be dropped.
        # The context manager closes the archive's file handle, which the bare
        # np.load(...)['arr_0'] expression never did.
        with np.load(file, allow_pickle = True) as archive:
            volume = archive['arr_0']

        # Keep only the region of interest
        volumes.append(volume[crop])

        if len(volumes) % 100 == 0:
            print(f"{len(volumes)} loaded")

    return np.array(volumes)

In [15]:
# Load training arrays
train_dataset = return_volumes(train_filenames)

100 loaded
200 loaded
300 loaded
400 loaded
500 loaded
600 loaded
700 loaded


In [12]:
# Load validation arrays
val_dataset = return_volumes(val_filenames)

In [13]:
# Load testing arrays
test_dataset = return_volumes(test_filenames)

In [16]:
# Report the array shape of each split
dataset_shapes = {
    "[+] Training shape:": train_dataset.shape,
    "[+] Validation shape:": val_dataset.shape,
    "[+] Testing shape:": test_dataset.shape,
}
for caption, shape in dataset_shapes.items():
    print(caption, shape)

[+] Training shape: (759, 110, 130, 80)
[+] Validation shape: (163, 110, 130, 80)
[+] Testing shape: (163, 110, 130, 80)


### Load labels

#### Load CSV files with image details: images IDs and class

In [17]:
# Load the individual CSV files with image details
df_1 = pd.read_csv(path + "Datasets/ADNI1_Complete_1Yr_1.5T.csv")
df_2 = pd.read_csv(path + "Datasets/ADNI1_Complete_2Yr_1.5T.csv")
df_3 = pd.read_csv(path + "Datasets/ADNI1_Complete_3Yr_1.5T.csv")

# Concatenate all CSV files into a single dataframe. ignore_index renumbers the
# rows, otherwise every file contributes its own 0..n labels and the index
# contains duplicates.
df = pd.concat([df_1, df_2, df_3], ignore_index = True)

# Remove whitespace from column names (literal replacement, not a regex)
df.columns = df.columns.str.replace(" ", "", regex = False)

df.head()

Unnamed: 0,ImageDataID,Subject,Group,Sex,Age,Visit,Modality,Description,Type,AcqDate,Format,Downloaded
0,I125941,137_S_1426,MCI,M,85,4,MRI,MPR-R; GradWarp; N3; Scaled,Processed,10/30/2008,NiFTI,
1,I121703,128_S_1408,MCI,M,73,4,MRI,MPR; GradWarp; B1 Correction; N3; Scaled,Processed,9/19/2008,NiFTI,
2,I121637,037_S_1421,MCI,F,76,4,MRI,MPR; GradWarp; N3; Scaled,Processed,9/17/2008,NiFTI,
3,I122382,128_S_1407,MCI,F,76,4,MRI,MPR; GradWarp; B1 Correction; N3; Scaled,Processed,9/05/2008,NiFTI,
4,I121689,127_S_1427,MCI,F,71,4,MRI,MPR; GradWarp; B1 Correction; N3; Scaled,Processed,9/02/2008,NiFTI,


In [18]:
# Retrieve only the image ID and Group (class) columns
df = df[["ImageDataID", "Group"]]
df.head()

Unnamed: 0,ImageDataID,Group
0,I125941,MCI
1,I121703,MCI
2,I121637,MCI
3,I122382,MCI
4,I121689,MCI


In [19]:
# Check number of cases by class in the dataframe
df["Group"].value_counts()

MCI    1708
CN     1009
AD      575
Name: Group, dtype: int64

#### Load class for each 3D volume (labels)

In [None]:
def return_labels(filenames, df):
    """Return the numeric class of every 3D volume, aligned with `filenames`.

    The image ID is the file's base name up to its first dot; the ID is looked
    up in `df` to obtain the group, which is encoded as 0 (CN) or 1 (AD).

    Args:
        filenames: paths of the volume files.
        df: dataframe with the columns "ImageDataID" and "Group".

    Returns:
        A numpy array with one [class] row per file, in the order of
        `filenames`.

    Raises:
        KeyError: if an image ID is not present in `df`.
        ValueError: if the group of an image is neither "CN" nor "AD"
            (e.g. "MCI"). Skipping such a file, as printing an error and
            continuing did, would shift every following label and silently
            pair volumes with the wrong class.
    """

    # Numeric class assigned to each supported group
    class_by_group = {"CN": 0, "AD": 1}

    # Build the ID -> group lookup once instead of scanning the dataframe for
    # every file. The first occurrence of a repeated ID wins, as it did with
    # the previous `.values[0]` lookup.
    group_by_id = (df.drop_duplicates(subset = "ImageDataID")
                     .set_index("ImageDataID")["Group"]
                     .to_dict())

    # List where to save the class for each image: 0 (CN), 1 (AD)
    labels = []

    for file in filenames:

        # Get image ID from file name
        image_id = os.path.basename(file).split(".")[0]

        # Retrieve class from the lookup by image ID
        if image_id not in group_by_id:
            raise KeyError(f"Image ID {image_id} not found in the dataframe")
        group = group_by_id[image_id]

        if group not in class_by_group:
            raise ValueError(f"ERROR with image ID {image_id}: unsupported group {group!r}")

        labels.append([class_by_group[group]])

    return np.array(labels)

In [21]:
# Load training labels
train_labels = return_labels(train_filenames, df)

In [22]:
# Load validation labels
val_labels = return_labels(val_filenames, df)

In [23]:
# Load testing labels
test_labels = return_labels(test_filenames, df)

In [24]:
# Report the shape of the label array of each split
label_shapes = {
    "[+] Training labels shape:": train_labels.shape,
    "[+] Validation labels shape:": val_labels.shape,
    "[+] Testing labels shape:": test_labels.shape,
}
for caption, shape in label_shapes.items():
    print(caption, shape)

[+] Training labels shape: (759, 1)
[+] Validation labels shape: (163, 1)
[+] Testing labels shape: (163, 1)


### Create TFRecords

Reference for this section: [A practical guide to TFRecords](https://towardsdatascience.com/a-practical-guide-to-tfrecords-584536bc786c)

#### Define functions

In [25]:
def _bytes_feature(value):
    """Wrap a string / byte value in a tf.train.Feature holding a bytes_list."""

    # Type of the tensors produced by tf.constant
    tensor_type = type(tf.constant(0))

    # A tensor is not stored directly: its underlying value is extracted first
    raw = value.numpy() if isinstance(value, tensor_type) else value

    bytes_list = tf.train.BytesList(value=[raw])
    return tf.train.Feature(bytes_list=bytes_list)

def _float_feature(value):
    """Wrap a float / double in a tf.train.Feature holding a float_list."""

    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
    """Wrap a bool / enum / int / uint in a tf.train.Feature holding an int64_list."""

    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)

def serialize_array(array):
    """Serialize `array` with tf.io.serialize_tensor and return the result."""

    return tf.io.serialize_tensor(array)

def parse_single_volume(volume, label):
    """Build the tf.train.Example that describes one volume and its label.

    The example stores the three leading dimensions of `volume` ('height',
    'width', 'depth'), the serialized volume ('raw_image') and the first
    element of `label` ('label').
    """

    # The label arrives as a length-1 array: only its first value is stored
    label_value = label[0]

    # Structure of a single example
    height, width, depth = volume.shape[0], volume.shape[1], volume.shape[2]
    feature_map = {
        'height' : _int64_feature(height),
        'width' : _int64_feature(width),
        'depth' : _int64_feature(depth),
        'raw_image' : _bytes_feature(serialize_array(volume)),
        'label' : _int64_feature(label_value),
    }

    # Wrap the individual features in an Example
    features = tf.train.Features(feature = feature_map)
    return tf.train.Example(features = features)

def write_images_to_tfr(volumes, labels, filename = "images", max_files = 10, out_dir = "../Datasets/"):
    """Write volumes and their labels to one or more TFRecord files.

    The volumes are written in order, at most `max_files` per TFRecord. The
    files are created inside `out_dir` and named "<i>_<filename>.tfrecords",
    with i starting at 1.

    Args:
        volumes: sequence of 3D volumes.
        labels: sequence of labels, one per volume, paired with `volumes` by
            position.
        filename: suffix used in the name of every TFRecord file.
        max_files: maximum number of volumes stored in one TFRecord.
        out_dir: output folder; it is created (with any missing parents) when
            it does not exist.

    Raises:
        ValueError: if `max_files` is smaller than 1, or if `volumes` and
            `labels` do not have the same length.
    """

    if max_files < 1:
        raise ValueError(f"max_files must be a positive integer, got {max_files}")

    n_volumes = len(volumes)

    # Volumes and labels are paired by index, so a length mismatch would either
    # crash midway or silently drop data
    if n_volumes != len(labels):
        raise ValueError(f"Got {n_volumes} volumes but {len(labels)} labels")

    # Determine the number of TFRecords needed (ceiling division)
    splits = -(-n_volumes // max_files)

    print(f"[+] Number of TFRecords needed for {len(volumes)} volumes: {splits}")
    print(f"    [-] Number of files per TFRecord: {max_files}")

    # Check directory: makedirs also creates missing parent folders, which
    # os.mkdir cannot do for nested paths such as ".../TFRecords/Train/"
    os.makedirs(out_dir, exist_ok = True)
    print(f"\n[+] Output directory: {out_dir}\n")

    # Write TFRecords
    file_count = 0

    for i in tqdm.tqdm(range(splits)):

        # Retrieve name of the TFRecord; os.path.join works whether or not
        # out_dir ends with a path separator
        tfr_name = os.path.join(out_dir, "{}_{}.tfrecords".format(i + 1, filename))
        print(f"[+] Writing TFRecord: {tfr_name}")

        # Range of dataset indices stored in this TFRecord; the last file may
        # hold fewer than max_files volumes
        start = i * max_files
        stop = min(start + max_files, n_volumes)

        # The context manager closes the writer even if serialization fails
        with tf.io.TFRecordWriter(tfr_name) as writer:

            for index in range(start, stop):

                # Create the required example representation
                out = parse_single_volume(volume = volumes[index], label = labels[index])

                writer.write(out.SerializeToString())
                file_count += 1

    print(f"[+] Number of files wrote to TFRecords: {file_count}")

#### Create train TFRecords

In [28]:
# Build the output folder from `path` (as the input folders are) rather than a
# hard-coded "../", so the records land in the project folder in both the
# local and the Google Drive set-up
write_images_to_tfr(train_dataset, train_labels,
                    max_files = 30,
                    filename = "train_volumes",
                    out_dir = path + "Datasets/TFRecords/Train/")

  0%|          | 0/26 [00:00<?, ?it/s]

[+] Number of TFRecords needed for 759 volumes: 26
    [-] Number of files per TFRecord: 30

[+] Output directory: ../Datasets/TFRecords/Train/

[+] Writing TFRecord: ../Datasets/TFRecords/Train/1_train_volumes.tfrecords


  4%|▍         | 1/26 [00:01<00:31,  1.25s/it]

[+] Writing TFRecord: ../Datasets/TFRecords/Train/2_train_volumes.tfrecords


  8%|▊         | 2/26 [00:02<00:29,  1.22s/it]

[+] Writing TFRecord: ../Datasets/TFRecords/Train/3_train_volumes.tfrecords


 12%|█▏        | 3/26 [00:03<00:27,  1.21s/it]

[+] Writing TFRecord: ../Datasets/TFRecords/Train/4_train_volumes.tfrecords


 15%|█▌        | 4/26 [00:04<00:26,  1.20s/it]

[+] Writing TFRecord: ../Datasets/TFRecords/Train/5_train_volumes.tfrecords


 19%|█▉        | 5/26 [00:06<00:27,  1.29s/it]

[+] Writing TFRecord: ../Datasets/TFRecords/Train/6_train_volumes.tfrecords


 23%|██▎       | 6/26 [00:07<00:27,  1.38s/it]

[+] Writing TFRecord: ../Datasets/TFRecords/Train/7_train_volumes.tfrecords


 27%|██▋       | 7/26 [00:09<00:28,  1.49s/it]

[+] Writing TFRecord: ../Datasets/TFRecords/Train/8_train_volumes.tfrecords


 31%|███       | 8/26 [00:11<00:26,  1.49s/it]

[+] Writing TFRecord: ../Datasets/TFRecords/Train/9_train_volumes.tfrecords


 35%|███▍      | 9/26 [00:12<00:25,  1.50s/it]

[+] Writing TFRecord: ../Datasets/TFRecords/Train/10_train_volumes.tfrecords


 38%|███▊      | 10/26 [00:14<00:23,  1.50s/it]

[+] Writing TFRecord: ../Datasets/TFRecords/Train/11_train_volumes.tfrecords


 42%|████▏     | 11/26 [00:15<00:21,  1.41s/it]

[+] Writing TFRecord: ../Datasets/TFRecords/Train/12_train_volumes.tfrecords


 46%|████▌     | 12/26 [00:16<00:18,  1.34s/it]

[+] Writing TFRecord: ../Datasets/TFRecords/Train/13_train_volumes.tfrecords


 50%|█████     | 13/26 [00:17<00:17,  1.31s/it]

[+] Writing TFRecord: ../Datasets/TFRecords/Train/14_train_volumes.tfrecords


 54%|█████▍    | 14/26 [00:18<00:15,  1.29s/it]

[+] Writing TFRecord: ../Datasets/TFRecords/Train/15_train_volumes.tfrecords


 58%|█████▊    | 15/26 [00:20<00:14,  1.27s/it]

[+] Writing TFRecord: ../Datasets/TFRecords/Train/16_train_volumes.tfrecords


 62%|██████▏   | 16/26 [00:21<00:12,  1.26s/it]

[+] Writing TFRecord: ../Datasets/TFRecords/Train/17_train_volumes.tfrecords


 65%|██████▌   | 17/26 [00:22<00:11,  1.25s/it]

[+] Writing TFRecord: ../Datasets/TFRecords/Train/18_train_volumes.tfrecords


 69%|██████▉   | 18/26 [00:23<00:09,  1.23s/it]

[+] Writing TFRecord: ../Datasets/TFRecords/Train/19_train_volumes.tfrecords


 73%|███████▎  | 19/26 [00:25<00:08,  1.25s/it]

[+] Writing TFRecord: ../Datasets/TFRecords/Train/20_train_volumes.tfrecords


 77%|███████▋  | 20/26 [00:26<00:07,  1.27s/it]

[+] Writing TFRecord: ../Datasets/TFRecords/Train/21_train_volumes.tfrecords


 81%|████████  | 21/26 [00:27<00:06,  1.25s/it]

[+] Writing TFRecord: ../Datasets/TFRecords/Train/22_train_volumes.tfrecords


 85%|████████▍ | 22/26 [00:28<00:05,  1.26s/it]

[+] Writing TFRecord: ../Datasets/TFRecords/Train/23_train_volumes.tfrecords


 88%|████████▊ | 23/26 [00:30<00:03,  1.28s/it]

[+] Writing TFRecord: ../Datasets/TFRecords/Train/24_train_volumes.tfrecords


 92%|█████████▏| 24/26 [00:31<00:02,  1.25s/it]

[+] Writing TFRecord: ../Datasets/TFRecords/Train/25_train_volumes.tfrecords


 96%|█████████▌| 25/26 [00:32<00:01,  1.26s/it]

[+] Writing TFRecord: ../Datasets/TFRecords/Train/26_train_volumes.tfrecords


100%|██████████| 26/26 [00:33<00:00,  1.28s/it]

[+] Number of files wrote to TFRecords: 759





#### Create validation TFRecords

In [27]:
# Build the output folder from `path` (as the input folders are) rather than a
# hard-coded "../", so the records land in the project folder in both the
# local and the Google Drive set-up
write_images_to_tfr(val_dataset, val_labels,
                    max_files = 30,
                    filename = "val_volumes",
                    out_dir = path + "Datasets/TFRecords/Validation/")

  0%|          | 0/6 [00:00<?, ?it/s]

[+] Number of TFRecords needed for 163 volumes: 6
    [-] Number of files per TFRecord: 30

[+] Output directory: ../Datasets/TFRecords/Validation/

[+] Writing TFRecord: ../Datasets/TFRecords/Validation/1_val_volumes.tfrecords


 17%|█▋        | 1/6 [00:01<00:07,  1.57s/it]

[+] Writing TFRecord: ../Datasets/TFRecords/Validation/2_val_volumes.tfrecords


 33%|███▎      | 2/6 [00:02<00:06,  1.51s/it]

[+] Writing TFRecord: ../Datasets/TFRecords/Validation/3_val_volumes.tfrecords


 50%|█████     | 3/6 [00:04<00:04,  1.47s/it]

[+] Writing TFRecord: ../Datasets/TFRecords/Validation/4_val_volumes.tfrecords


 67%|██████▋   | 4/6 [00:06<00:03,  1.57s/it]

[+] Writing TFRecord: ../Datasets/TFRecords/Validation/5_val_volumes.tfrecords


 83%|████████▎ | 5/6 [00:07<00:01,  1.55s/it]

[+] Writing TFRecord: ../Datasets/TFRecords/Validation/6_val_volumes.tfrecords


100%|██████████| 6/6 [00:08<00:00,  1.39s/it]

[+] Number of files wrote to TFRecords: 163





#### Create test TFRecords

In [26]:
# Build the output folder from `path` (as the input folders are) rather than a
# hard-coded "../", so the records land in the project folder in both the
# local and the Google Drive set-up
write_images_to_tfr(test_dataset, test_labels,
                    max_files = 30,
                    filename = "test_volumes",
                    out_dir = path + "Datasets/TFRecords/Test/")

  0%|          | 0/6 [00:00<?, ?it/s]

[+] Number of TFRecords needed for 163 volumes: 6
    [-] Number of files per TFRecord: 30

[+] Output directory: ../Datasets/TFRecords/Test/

[+] Writing TFRecord: ../Datasets/TFRecords/Test/1_test_volumes.tfrecords


 17%|█▋        | 1/6 [00:01<00:09,  1.82s/it]

[+] Writing TFRecord: ../Datasets/TFRecords/Test/2_test_volumes.tfrecords


 33%|███▎      | 2/6 [00:03<00:06,  1.73s/it]

[+] Writing TFRecord: ../Datasets/TFRecords/Test/3_test_volumes.tfrecords


 50%|█████     | 3/6 [00:04<00:04,  1.60s/it]

[+] Writing TFRecord: ../Datasets/TFRecords/Test/4_test_volumes.tfrecords


 67%|██████▋   | 4/6 [00:05<00:03,  1.51s/it]

[+] Writing TFRecord: ../Datasets/TFRecords/Test/5_test_volumes.tfrecords


 83%|████████▎ | 5/6 [00:07<00:01,  1.49s/it]

[+] Writing TFRecord: ../Datasets/TFRecords/Test/6_test_volumes.tfrecords


100%|██████████| 6/6 [00:08<00:00,  1.34s/it]

[+] Number of files wrote to TFRecords: 163



