In [1]:
import keras
import numpy as np
import re
from pathlib import Path
from typing import Tuple, List, Union, Dict
from PIL import Image
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.preprocessing import Normalizer
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE


Using TensorFlow backend.


In [2]:
# with help from:
# https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly

class XRayLoader:
    
    def __init__(self,
                 data_path: Path,
                 batch_size: int,
                 type_of_set: Union['train', 'valid', 'test'],
                 pneumonia_dict: Dict[str, int],
                 img_shape: Tuple[int, int] = (28, 28),
                 augment_rotation: bool = False,
                 shuffle: bool = False,
                 preload: bool = True):
        
        
        self.data_path: Path = data_path
        self.img_shape: Tuple[int, int] = img_shape
        self.batch_size: int = batch_size
        self.type_of_set: str = type_of_set # e.g. train / valid / test
        self.augment_rotation: bool = augment_rotation
        self.shuffle: bool = shuffle
        self.pneumonia_type_dict: Dict[str, int] = pneumonia_dict
        self.healthy_ids : List[int] = []
        self.pneumonia_ids: List[int] = []
        self.pneumonia_type: List[int] = []
        self.healthy_id_regex: str = 'IM-(.*?)-'
        self.pneumonia_id_regex: str = 'person(.*?)_'
        self.pneumonia_type_regex: str = '_(.*?)_'
        self.preload = preload
        self.x: np.ndarray = None # (samples, img_shape[0], img_shape[1], channels)
            
        self.ids: List[int] = []
            
        self.img_ids: List[int] = []
        self.img_names: List[Path] = []
        self.current_img_id: int = 0
            
    def on_epoch_end(self):
            'Updates indexes after each epoch'
            self.indexes = np.arange(len(self.list_IDs))
            if self.shuffle == True:
                np.random.shuffle(self.indexes)
        
    def generate_ids(self):
        
        self.full_data_path = self.data_path.joinpath(Path(self.type_of_set))
        print(f'Finding classes in: {self.full_data_path}')
        self.classes = [d for d in self.full_data_path.iterdir() if d.is_dir()]
        print(f'Found classes: {[s.name for s in self.classes]}')
        for c in self.classes:
            files = [f for f in c.iterdir() if f.is_file()]
            if c.name.lower() == 'normal':
                self.healthy_ids = list(map(self.__extract_healthy_id, files))
            elif c.name.lower() == 'pneumonia':
                self.pneumonia_ids = list(map(self.__extract_pneumonia_id, files))
                self.pneumonia_type = list(map(self.__extract_pneumonia_type, files))
            self.img_ids += list(range(self.current_img_id, len(files)))
            self.current_img_id += len(files)
            self.img_names += [img_name.relative_to(self.data_path) for img_name in c.iterdir() if img_name.is_file()] 
            if self.preload:
                for f in files:
                    img_arr = np.asarray(Image.open(f))
                    print(img_arr.shape, img_arr.dtype, np.min(img_arr), np.max(img_arr))
                    return
                    
        
        self.ids = self.healthy_ids + self.pneumonia_ids
        self.targets = [0]*len(self.healthy_ids) + self.pneumonia_type
            
    def __extract_healthy_id(self, 
                            p: Path) -> int:
        s = p.name
        return int(re.search(self.healthy_id_regex, s).group(1))
    
    def __extract_pneumonia_id(self, 
                               p: Path) -> int:
        s = p.name
        return int(re.search(self.pneumonia_id_regex, s).group(1))
    
    def __extract_pneumonia_type(self, 
                                 p: Path) -> int:
        s = p.name
        t = str(re.search(self.pneumonia_type_regex, s).group(1))
        return self.pneumonia_type_dict[t]
        
        
    def __len__(self):
        pass
        

In [3]:
d = {'bacteria': 1, 'virus': 2} # normal = 0
x = XRayLoader(Path('/Volumes/SEAGATE/chest_xray'), 
               batch_size=128, 
               type_of_set='train',
               pneumonia_dict=d)
print(x.generate_ids())

Finding classes in: /Volumes/SEAGATE/chest_xray/train
Found classes: ['NORMAL', 'PNEUMONIA']
(1858, 2090) uint8 0 255
None


In [19]:
datagen = keras.preprocessing.image.ImageDataGenerator(data_format='channels_last')
train_generator = datagen.flow_from_directory(
                Path('/Volumes/SEAGATE/chest_xray/train'),
                color_mode='grayscale',
                batch_size=32,
                class_mode=None)
validation_generator = datagen.flow_from_directory(
                Path('/Volumes/SEAGATE/chest_xray/val'),
                color_mode='grayscale',
                batch_size=16,
                class_mode=None)

def random_crop(img, random_crop_size):
    # Note: image_data_format is 'channel_last'
    assert img.shape[2] == 3
    height, width = img.shape[0], img.shape[1]
    dy, dx = random_crop_size
    x = np.random.randint(0, width - dx + 1)
    y = np.random.randint(0, height - dy + 1)
    return img[y:(y+dy), x:(x+dx), :]


def crop_generator(batches, crop_length):
    """Take as input a Keras ImageGen (Iterator) and generate random
    crops from the image batches generated by the original iterator.
    """
    while True:
        batch_x, batch_y = next(batches)
        batch_crops = np.zeros((batch_x.shape[0], crop_length, crop_length, 3))
        for i in range(batch_x.shape[0]):
            batch_crops[i] = random_crop(batch_x[i], (crop_length, crop_length))
        yield (batch_crops, batch_y)
        
train_generator = crop_generator(train_generator, 128)
validation_generator = crop_generator(validation_generator, 128)

Found 5216 images belonging to 2 classes.
Found 16 images belonging to 2 classes.


In [20]:
tg = zip(train_generator, train_generator)
vg = zip(validation_generator, validation_generator)

class L2Layer(keras.layers.Layer):
    
    def __init__(self, **kwargs):
        super(L2Layer, self).__init__(**kwargs)
        
    def call(self, x):
        return tf.keras.backend.l2_normalize(x, axis=0)
        
    

In [21]:
model = keras.models.Sequential()
model.add(keras.layers.Conv2D(filters=64, kernel_size=5, strides=2, input_shape=(68, 68, 1), activation='relu', padding='same'))
model.add(keras.layers.Lambda(lambda x: tf.keras.backend.l2_normalize(x, axis=0)))
model.add(keras.layers.Conv2D(filters=128, kernel_size=5, strides=2, activation='relu', padding='same'))
model.add(keras.layers.Lambda(lambda x: tf.keras.backend.l2_normalize(x, axis=0)))
model.add(keras.layers.Conv2D(filters=256, kernel_size=3, strides=2, activation='relu', padding='valid'))
model.add(keras.layers.Lambda(lambda x: tf.keras.backend.l2_normalize(x, axis=0)))
model.add(keras.layers.Flatten())
model.add(keras.layers.Dense(units=128))
model.add(keras.layers.Lambda(lambda x: tf.keras.backend.l2_normalize(x, axis=0), name='embeddings'))
model.add(keras.layers.Dense(units=16384))
#model.add(keras.layers.Lambda(lambda x: tf.keras.backend.l2_normalize(x, axis=0)))
model.add(keras.layers.Reshape( (8, 8, 256) ))
model.add(keras.layers.Conv2DTranspose(filters=128, kernel_size=3, strides=2, padding='valid', activation='relu', name='tp1'))
#model.add(keras.layers.Lambda(lambda x: tf.keras.backend.l2_normalize(x, axis=0)))
model.add(keras.layers.Conv2DTranspose(filters=64, kernel_size=5, strides=2, padding='same', activation='relu', name='tp2'))
#model.add(keras.layers.Lambda(lambda x: tf.keras.backend.l2_normalize(x, axis=0)))
model.add(keras.layers.Conv2DTranspose(filters=1, kernel_size=5, strides=2, padding='same', activation='relu', name='tp3'))
model.compile(optimizer='adam',
                  loss='mse')
model.summary()

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_28 (Conv2D)           (None, 34, 34, 64)        1664      
_________________________________________________________________
lambda_28 (Lambda)           (None, 34, 34, 64)        0         
_________________________________________________________________
conv2d_29 (Conv2D)           (None, 17, 17, 128)       204928    
_________________________________________________________________
lambda_29 (Lambda)           (None, 17, 17, 128)       0         
_________________________________________________________________
conv2d_30 (Conv2D)           (None, 8, 8, 256)         295168    
_________________________________________________________________
lambda_30 (Lambda)           (None, 8, 8, 256)         0         
_________________________________________________________________
flatten_10 (Flatten)         (None, 16384)           

In [22]:
model.fit_generator(
         generator=tg,  
         epochs=50,
         steps_per_epoch=163,
         validation_data = vg,
         validation_steps=1,
         verbose=1,
         validation_freq=1,
         use_multiprocessing=True,
         shuffle=True)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50

OSError: [Errno 6] Device not configured

In [None]:
test_generator = datagen.flow_from_directory(
                Path('/Volumes/SEAGATE/chest_xray/test'),
                color_mode='grayscale',
                target_size=(28, 28),
                batch_size=16,
                class_mode='binary')
encoder = keras.Model(inputs=model.input, outputs=model.get_layer(name='lambda_8').output)
embeddings = encoder.predict_generator(train_generator)

print(f'Embeddings shape: {embeddings.shape}')
clusters = KMeans(n_clusters=3).fit_transform(embeddings)
print(f'Clusters shape: {clusters.shape}')

def plot_tsne(data: np.ndarray,
              labels: np.ndarray,
              dataset_name: np.ndarray,
              num_samples_per_class: int = 100) -> None:
    unique_labels = np.unique(labels)
    x_tsne = np.array([])
    for i in range(len(unique_labels)):
        label_ix = np.argwhere(unique_labels[i] == labels).flatten()
        random_labels_ix = np.random.choice(label_ix, num_samples_per_class)
        random_label_data = data[random_labels_ix, :]
        print(np.mean(random_label_data), np.std(random_label_data))
        random_label_data = np.reshape(random_label_data, (num_samples_per_class, -1))
        if i == 0:
            x_tsne = random_label_data
        else:
            x_tsne = np.vstack((x_tsne, random_label_data))
    tsne_emb = TSNE(n_components=2).fit_transform(x_tsne)
    i = 0
    for i in range(len(unique_labels)):
        rgb = np.random.rand(3,)
        start_slice = (i)*num_samples_per_class
        stop_slice = (i+1)*num_samples_per_class
        x_plot = tsne_emb[()]
        if i == 0:
            plt.scatter(tsne_emb[start_slice:stop_slice, 0], tsne_emb[start_slice:stop_slice,1], c='b', s=10, alpha=0.5)
        else:
            plt.scatter(tsne_emb[start_slice:stop_slice, 0], tsne_emb[start_slice:stop_slice,1], c='r', s=10, alpha=0.5)
    plt.title(f'T-SNE features {dataset_name}')
    plt.legend(unique_labels)
    plt.show()

In [None]:
plot_tsne(embeddings,
          test_generator.classes,
          'Xray dataset',
          num_samples_per_class=300)

In [None]:
plot_tsne(clusters,
          test_generator.classes,
          'Xray dataset',
          num_samples_per_class=300)