In [1]:
%load_ext autoreload
%autoreload 2

In [11]:
from Setup.config import config
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from Tools.leica_tools import RawLoader
from Tools.db_tools import DbManager
from functools import partial
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from keras.models import Model
from keras.layers import Input, Conv2D, Flatten, Dense, MaxPooling2D, Dropout, Layer
from keras.losses import MeanSquaredError, CategoricalCrossentropy, BinaryCrossentropy
from keras.metrics import CategoricalAccuracy, BinaryAccuracy
from keras.optimizers import Adam
from keras.utils import plot_model

# KMeans detection

In [3]:
# Experiment whose frames are loaded and clustered in the cells below.
expID = 'NKIP_FA_056'

In [90]:
# Handles used throughout this section: the per-experiment loader, the
# database manager, and the drop register that the clustering result is
# joined onto before being written back.
rawloader = RawLoader(expID)
dbm = DbManager()
drop_register = rawloader.get_dropregister()

In [91]:
# Build one normalised intensity histogram per frame (channel 0 only); each
# row of `histograms` is the feature vector clustered in the next cell.
ds, spec = dbm.get_dataset(expID, return_spec=True)
bins = 256
histograms = np.zeros((spec['n_frames'], bins))
globalIDs = np.zeros(spec['n_frames'])
for i, element in enumerate(ds.as_numpy_iterator()):
    globalIDs[i] = element['GlobalID']
    frame = element['frame']
    # Discard the returned bin edges. Unpacking them into `bins` would turn
    # the bin count into an edge array, so every later iteration would pass
    # explicit edges (and silently ignore `range`).
    hist, _ = np.histogram(frame[:, :, 0].flatten(), bins=bins, range=(0, 65535), density=True)
    # Rescale so each row sums to 1 regardless of bin width.
    histograms[i, :] = hist / np.sum(hist)

2024-05-29 13:41:14.576346: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [95]:
# Group the per-frame histograms with KMeans and report how many frames
# ended up in each group.
n_clusters = 10
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
clusters = kmeans.fit_predict(histograms)
cluster_labels, counts = np.unique(clusters, return_counts=True)
for label, count in zip(cluster_labels, counts):
    print(f'Group {label} labeled with {count} droplets')

# One row per frame, keyed by its integer GlobalID, holding the assigned cluster.
frame_ids = pd.Index(globalIDs, name='GlobalID').astype(int)
cluster_df = pd.DataFrame({'cluster': clusters}, index=frame_ids)

Group 0 labeled with 9003 droplets
Group 1 labeled with 10315 droplets
Group 2 labeled with 1446 droplets
Group 3 labeled with 10000 droplets
Group 4 labeled with 15651 droplets
Group 5 labeled with 15695 droplets
Group 6 labeled with 19130 droplets
Group 7 labeled with 4130 droplets
Group 8 labeled with 26219 droplets
Group 9 labeled with 18424 droplets


In [96]:

# Write a PDF with one page per cluster, each page showing a 4x4 grid of
# randomly sampled frames from that cluster.
kmeans_dir = os.path.join(rawloader.an_dir, 'KMeans')
# The output folder is not guaranteed to exist yet.
os.makedirs(kmeans_dir, exist_ok=True)
with PdfPages(os.path.join(kmeans_dir, f'KMeans_{n_clusters}c.pdf')) as pdf:
    for c in cluster_labels:
        fig, axs = plt.subplots(figsize=(4,4), ncols=4, nrows=4)
        members = cluster_df.query(f'cluster == {c}')
        # Never request more samples than the cluster holds; DataFrame.sample
        # raises when n exceeds the population (without replacement).
        IDs = members.sample(min(len(members), axs.size)).index
        frames = dbm.filter_db(expID, IDs)[:, :, :, 0]
        for ax in axs.flatten():
            ax.grid(False)
            ax.set_yticks([])
            ax.set_xticks([])
        # zip stops at the shorter sequence, so panels without a frame stay empty.
        for ax, frame in zip(axs.flatten(), frames):
            ax.imshow(frame/65535, cmap='gray', vmin=0, vmax=1)
        fig.suptitle(f'Samples of cluster {c}', fontsize=15)
        pdf.savefig(fig)
        # Close this specific figure so figures do not pile up across the loop.
        plt.close(fig)

2024-05-29 13:42:14.241974: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-05-29 13:42:14.336075: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-05-29 13:42:14.427339: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-05-29 13:42:14.522692: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-05-29 13:42:14.620003: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-05-29 13:42:14.775491: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-05-29 13:42:14.930706: W tensorflow/core/framework/local_rendezvous.cc:404] L

In [99]:
# Flag every frame that fell into cluster 0, 2 or 7 as an outlier
# (presumably picked from the per-cluster sample PDF -- confirm).
outlier_clusters = [0, 2, 7]
cluster_df['outlier_KMeans'] = False
outlier_ids = cluster_df.index[cluster_df['cluster'].isin(outlier_clusters)]
cluster_df.loc[outlier_ids, 'outlier_KMeans'] = True

In [100]:
# Persist the cluster assignment and outlier flag alongside the register.
# Same-named columns are dropped first: DataFrame.join raises on overlapping
# column names, which would make this cell fail whenever the stored register
# already carries results from an earlier run.
rawloader.update_dropregister(
    drop_register.drop(columns=cluster_df.columns, errors='ignore').join(cluster_df)
)

# CNN Training

In [4]:
# Database handle used as a module-level global by build_dataset / get_data below.
dbm = DbManager()

In [5]:
def outlier_class(input_shape=(128, 128, 1), dropout_rate=0.5):
    """Build the (uncompiled) CNN used to classify frames as outlier / non-outlier.

    Architecture: three Conv2D(3x3, ReLU) + MaxPooling2D(2x2) stages with
    32, 64 and 128 filters, a Flatten layer, three Dense(ReLU) + Dropout
    stages with 512, 256 and 128 units, and a 2-unit softmax output.

    Layer names are fixed ('outlier_input', 'outlier_conv1', ...,
    'outlier_output'); the input and output names match the dictionary keys
    produced by ``prepare_data``.

    Args:
        input_shape: Shape of a single input sample, without the batch axis.
        dropout_rate: Dropout rate applied after each hidden dense layer.

    Returns:
        A ``keras.Model`` named 'outlier_model'.
    """
    inputs = Input(shape=input_shape, name='outlier_input')

    # Convolutional feature extractor.
    x = inputs
    for idx, filters in enumerate((32, 64, 128), start=1):
        x = Conv2D(filters, (3, 3), activation='relu', name=f'outlier_conv{idx}')(x)
        x = MaxPooling2D(pool_size=(2, 2), name=f'outlier_pool{idx}')(x)

    # Flatten the output from the convolutional layers.
    x = Flatten(name='outlier_flatten')(x)

    # Fully connected head, each dense layer followed by dropout.
    for idx, units in enumerate((512, 256, 128), start=1):
        x = Dense(units, activation='relu', name=f'outlier_dense{idx}')(x)
        x = Dropout(dropout_rate, name=f'outlier_dropout{idx}')(x)

    # Two-unit softmax (not a single sigmoid unit): one probability per
    # class, matching the one-hot labels of depth 2.
    output = Dense(2, activation='softmax', name='outlier_output')(x)

    return Model(inputs=inputs, outputs=output, name='outlier_model')

In [6]:
def prepare_data(element, outlier_data):
    """Turn one dataset element into an (inputs, targets) pair for the outlier model.

    Args:
        element: Dict of tensors with at least the keys 'GlobalID', 'expID'
            and 'frame'. It is extended in place with 'outlier_input'.
        outlier_data: pandas object indexed by (expID string, GlobalID) with
            an 'outlier_KMeans' column that supplies the label.

    Returns:
        Tuple ``(element, outputs)`` where ``element['outlier_input']`` is
        channel 0 of the frame as float32 divided by 65535, and
        ``outputs['outlier_output']`` is the label one-hot encoded with
        depth 2 and cast to int64.
    """
    def _lookup(exp_id, global_id):
        # Executed eagerly through tf.py_function, so .numpy() is available.
        return outlier_data.loc[(exp_id.numpy().decode(), global_id.numpy()), 'outlier_KMeans']

    raw_frame = element['frame']
    element['outlier_input'] = tf.cast(raw_frame[:, :, tf.constant(0)], tf.float32) / 65535

    flag = tf.py_function(_lookup, [element['expID'], element['GlobalID']], tf.int64)
    # py_function drops static shape information; the label is a scalar.
    flag.set_shape(())
    one_hot = tf.cast(tf.one_hot(flag, 2), tf.int64)
    return element, {'outlier_output': one_hot}

def build_dataset(expIDs):
    """Create a labelled dataset covering several experiments.

    Args:
        expIDs: List of experiment identifiers.

    Returns:
        The dataset from ``dbm.get_datasets(expIDs, shuffle=True)`` mapped
        through ``prepare_data``, with labels taken from the
        'outlier_KMeans' column of each experiment's drop register.
    """
    def _label_table(exp_id):
        # Key the flags by (experiment, original register index) so entries
        # from different experiments cannot collide after concatenation.
        register = RawLoader(exp_id).get_dropregister()
        flags = register[['outlier_KMeans']].copy()
        keyed_index = pd.MultiIndex.from_product([[exp_id], flags.index])
        return flags.set_index(keyed_index)

    frames = dbm.get_datasets(expIDs, shuffle=True)
    labels = pd.concat([_label_table(exp_id) for exp_id in expIDs])
    return frames.map(partial(prepare_data, outlier_data=labels))

In [7]:
# Four experiments form the training set; a fifth, separate experiment is
# held out for validation.
expIDs = ['NKIP_FA_052','NKIP_FA_053', 'NKIP_FA_055', 'NKIP_FA_056']
dataset = build_dataset(expIDs)
validation_dataset = build_dataset(['NKIP_FA_051'])

In [8]:
# Count elements by folding over each dataset once (every element adds 1).
# This iterates the full mapped pipeline, so it is as slow as one pass over the data.
n_elements = dataset.reduce(tf.constant(0), lambda a,b: a+1).numpy()
n_elements_val = validation_dataset.reduce(tf.constant(0), lambda a,b: a+1).numpy()
print(f'{n_elements} frames in train dataset')
# The message says "test", but the count is for validation_dataset.
print(f'{n_elements_val} frames in test dataset')

514848 frames in train dataset
126699 frames in test dataset


In [9]:
# Final input pipelines: both are repeated twice and batched in groups of 32;
# only the training stream is shuffled (buffer of 15000 elements).
train_final = dataset.shuffle(15000).repeat(2).batch(32)
test_final = validation_dataset.repeat(2).batch(32)

In [10]:
# Instantiate the uncompiled outlier classifier.
model = outlier_class()

In [14]:
# Render the architecture diagram to 'outlier_exclusion.png' (relative path,
# so it lands in the current working directory).
model_arch = plot_model(model, to_file='outlier_exclusion.png', dpi=100)

In [108]:
# Loss and metric are attached by output-layer name ('outlier_output').
# NOTE(review): the output layer is a 2-unit softmax fed with one-hot labels,
# yet a binary loss/metric is used here while CategoricalCrossentropy and
# CategoricalAccuracy are imported but unused -- confirm this pairing is intended.
model.compile(optimizer=Adam(),
              loss={'outlier_output': BinaryCrossentropy()},
              metrics={'outlier_output': BinaryAccuracy()})

In [109]:
# Train for two epochs. `batch_size` is deliberately not passed: both inputs
# are tf.data pipelines that were already batched with .batch(32), and Keras
# documents that batch_size must not be given for dataset inputs.
# steps_per_epoch=16089 equals 514848 // 32, i.e. one pass over the training
# frames counted above per epoch.
# NOTE(review): validation_steps=4701 does not equal 126699 // 32 (= 3959);
# confirm the intended number of validation batches.
# The History object is kept so the training curves can be inspected later.
history = model.fit(train_final, validation_data=test_final, steps_per_epoch=16089, epochs=2, validation_steps=4701)

Epoch 1/2
[1m16089/16089[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2114s[0m 131ms/step - binary_accuracy: 0.9518 - loss: 0.1296 - val_binary_accuracy: 0.9576 - val_loss: 0.1132
Epoch 2/2
[1m16089/16089[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2094s[0m 130ms/step - binary_accuracy: 0.9727 - loss: 0.0747 - val_binary_accuracy: 0.9652 - val_loss: 0.1034


2024-05-29 14:58:06.311782: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
  self.gen.throw(value)


<keras.src.callbacks.history.History at 0x30ebe4e30>

In [110]:
# Store the trained model under the configured MODEL_DIR.
# NOTE(review): the '.h5' suffix presumably selects Keras' legacy HDF5
# format -- confirm that this format is still wanted.
model.save(os.path.join(config['MODEL_DIR'], 'outlier_v2.h5'))



In [3]:
# Per-element preprocessing: scale the frame and attach its one-hot outlier label.
def prepare_data(element, outlier_df):
    """Add model input and target tensors to a dataset element.

    NOTE(review): this redefines ``prepare_data`` from an earlier cell. The
    label table keyword here is ``outlier_df`` (the earlier one uses
    ``outlier_data``, which is what ``build_dataset`` passes), and the same
    dict is returned as both inputs and targets -- confirm which definition
    is meant to be active.

    Args:
        element: Dict of tensors with at least the keys 'GlobalID', 'expID'
            and 'frame'. It is extended in place.
        outlier_df: pandas object indexed by (expID string, GlobalID) with an
            'outlier_KMeans' column.

    Returns:
        Tuple ``(element, element)``; the dict carries 'outlier_input'
        (channel 0 as float32 divided by 65535) and 'outlier_output'
        (label one-hot encoded with depth 2, int64).
    """
    def _lookup(exp_id, global_id):
        # Runs eagerly inside tf.py_function, so .numpy() is available.
        return outlier_df.loc[(exp_id.numpy().decode(), global_id.numpy()), 'outlier_KMeans']

    scaled = tf.cast(element['frame'][:, :, tf.constant(0)], tf.float32) / 65535

    flag = tf.py_function(_lookup, [element['expID'], element['GlobalID']], tf.int64)
    # py_function drops static shape information; the label is a scalar.
    flag.set_shape(())
    element['outlier_output'] = tf.cast(tf.one_hot(flag, 2), tf.int64)
    element['outlier_input'] = scaled
    return element, element


In [7]:
def get_data(expID):
    """Return the frame dataset of one experiment together with its outlier labels.

    Args:
        expID: Experiment identifier.

    Returns:
        Tuple ``(dataset, outlier_df)``: the result of
        ``dbm.get_dataset(expID)`` and a DataFrame holding only the
        'outlier_KMeans' column, indexed by ('expID', 'GlobalID').
        Assumes resetting the drop register's index yields a 'GlobalID'
        column -- TODO confirm.
    """
    register = RawLoader(expID).get_dropregister()
    labels = (
        register[['outlier_KMeans']]
        .reset_index()
        .assign(expID=expID)
        .set_index(['expID', 'GlobalID'])
    )
    return dbm.get_dataset(expID), labels

# Inference

In [348]:
# Experiment to run inference on. Rebinds the globals expID, rawloader and
# drop_register used by the earlier sections.
expID = 'AT_2024_007'
rawloader = RawLoader(expID)
drop_register = rawloader.get_dropregister()
# Mapping from register index to the existing 'outlier' value. Note that this
# section reads and writes the 'outlier' column, not 'outlier_KMeans'.
label_dict = drop_register['outlier'].to_dict()

In [349]:
# Run the classifier over every frame of the experiment in batches of 32.
# NOTE(review): `normalize` is not defined in any visible cell of this
# notebook, so this cell depends on leftover kernel state and will raise
# NameError on a fresh Restart & Run All -- confirm where it comes from.
dataset = dbm.get_dataset(expID)
normalized_dataset = dataset.map(partial(normalize, label_dict=label_dict)).batch(32)
y_predict_raw = model.predict(normalized_dataset)


[1m3403/3403[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m167s[0m 49ms/step


  self.gen.throw(value)
2024-05-14 12:59:10.189554: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [350]:
# Collect the GlobalIDs with a second pass over the dataset.
# NOTE(review): assumes this pass yields elements in the same order as the
# one consumed by model.predict above -- confirm get_dataset does not shuffle.
globalIDs = [e['GlobalID'] for e in dataset.as_numpy_iterator()]
# Index of the larger of the two class scores, as a boolean (True == class 1).
y_predict = np.argmax(y_predict_raw,axis=1).astype(bool)
# Overwrite the 'outlier' flag for those IDs and persist the register.
drop_register.loc[globalIDs, 'outlier'] = y_predict
rawloader.update_dropregister(drop_register)

2024-05-14 13:01:11.973711: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [352]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

In [354]:
# Write a two-page PDF: an 8x8 grid of sampled frames flagged as outliers,
# then an 8x8 grid of sampled frames flagged as non-outliers.
with PdfPages(os.path.join(rawloader.an_dir, 'outlier_summary.pdf')) as pdf:
    for outlier in [True, False]:
        fig, axs = plt.subplots(figsize=(8,8), ncols=8, nrows=8)
        subset = drop_register.query(f'outlier == {outlier}').copy()
        size = subset.index.size
        IDs = subset.sample(min(size, axs.size)).index
        # Skip the database lookup entirely when the class is empty.
        frames = dbm.filter_db(expID, IDs)[:, :, :, 0] if size else []
        for ax in axs.flatten():
            ax.grid(False)
            ax.set_yticks([])
            ax.set_xticks([])
        # Fewer frames than panels may be available; zip stops at the shorter
        # sequence, so indexing past the end of `frames` cannot happen.
        for ax, frame in zip(axs.flatten(), frames):
            ax.imshow(frame/65535, cmap='gray', vmin=0, vmax=1)
        if outlier:
            fig.suptitle(f'Samples of detected outliers ({size} in total)', fontsize=15)
        else:
            fig.suptitle(f'Samples of detected non-outliers ({size} in total)', fontsize=15)
        pdf.savefig(fig)
        # Close this specific figure rather than whichever one is current.
        plt.close(fig)

2024-05-14 13:06:14.183886: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-05-14 13:06:14.304092: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-05-14 13:06:14.388476: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-05-14 13:06:14.475397: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-05-14 13:06:14.560140: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-05-14 13:06:14.644242: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-05-14 13:06:14.727129: W tensorflow/core/framework/local_rendezvous.cc:404] L