In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import json
from PIL import Image
import pickle
import time

from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

from define_dataset import define_dataset
from define_ext_dataset import define_ext_dataset
from remove_outliers_clustering import find_and_remove_outliers_clustering, remove_outliers_clustering

In [2]:
dataset = define_dataset()
tr_set = dataset.tr_set
test_set = dataset.test_set
results_dict = dataset.results_dict
num_positives = dataset.num_positives
num_negatives = dataset.num_negatives

In [3]:
ext_dataset = define_ext_dataset()

ext_ts_set = ext_dataset.data
ext_results_dict = ext_dataset.results_dict
num_positive_ext = ext_dataset.num_positives
num_negatives_ext = ext_dataset.num_negatives
batch1_ext = ext_dataset.batch1
batch2_ext = ext_dataset.batch2
batch3_ext = ext_dataset.batch3

# Cluster labels subdivision by slide

In [4]:
def subdivide_clusters_by_slide(n_clusters, labels, percentile, dataset='tr'):
    # Labels are still ordered by slide, ie the first 25000 tiles are from the first slide, tile 25001
    # to 70000 is from the second slide and so on
    
    if dataset=='ts':
        num_tiles_dict = num_tiles_dict_ts
    elif dataset=='ext':
        num_tiles_dict = num_tiles_dict_ext
    else:
        num_tiles_dict = num_tiles_dict_tr
    
    # Save path
    slide_cluster_pth = "./clustering/ae/slide_clusters_rem_outliers/kmeans{}_out_perc{}".format(
        n_clusters, percentile)
    if not os.path.exists(slide_cluster_pth):
        os.makedirs(slide_cluster_pth)
    
    # Count tile labels for each slide and save them in same slide folder
    start = 0
    end = 0
    for slide_name in num_tiles_dict:
        slide_num_tiles = num_tiles_dict[slide_name]
        end += slide_num_tiles
        slide_labels = labels[start:end]
        start += slide_num_tiles
        np.save(os.path.join(slide_cluster_pth, slide_name+".npy"), slide_labels)

# Import num tiles dicts

In [5]:
# Dicts with number of tiles for each slide
tiles_num_dict_path = './data/num_tiles_dict_tr.json'
with open(tiles_num_dict_path) as json_file:
    num_tiles_dict_tr = json.load(json_file)
    
tiles_num_dict_path = './data/num_tiles_dict_ts.json'
with open(tiles_num_dict_path) as json_file:
    num_tiles_dict_ts = json.load(json_file)

tiles_num_dict_path = './data/num_tiles_dict_ext.json'
with open(tiles_num_dict_path) as json_file:
    num_tiles_dict_ext = json.load(json_file)

In [6]:
 def get_tile(index, dataset='tr'):
    tiles_path='../WSI/tiles/'
    curr_tiles_sum = 0
    
    if dataset=='ts':
        num_tiles_dict = num_tiles_dict_ts
    elif dataset=='ext':
        num_tiles_dict = num_tiles_dict_ext
    else:
        num_tiles_dict = num_tiles_dict_tr

    for slide_name in num_tiles_dict:
        if index < curr_tiles_sum + num_tiles_dict[slide_name]:
            selected_slide_name = slide_name
            break
        curr_tiles_sum += num_tiles_dict[slide_name]

    tile_idx = index - curr_tiles_sum

    tile_path = os.path.join(tiles_path, selected_slide_name, str(tile_idx) + '.jpg')

    img = Image.open(tile_path)


    return img, selected_slide_name

# Import tile embeddings

In [7]:
epoch = 20
save_model_path = './ae_models/ae/'

embed_path_tr = os.path.join(save_model_path, 'z_ae_test_epoch{}.npy'.format(epoch))
embed_tiles_tr = np.load(embed_path_tr)

embed_path_ts = os.path.join(save_model_path, 'z_ae_internal_test_epoch{}.npy'.format(epoch))
embed_tiles_ts = np.load(embed_path_ts)

embed_path_ext = os.path.join(save_model_path, 'z_ae_ext_test_epoch{}.npy'.format(epoch))
embed_tiles_ext = np.load(embed_path_ext)

# Remove Outliers from clustering and find max in Tr set

In [8]:
n_clusters_arr = [32, 64, 128, 256]
#n_clusters_arr = [32]

In [9]:
perc_arr = [75, 80, 85, 90]

Remove from each cluster the tiles with distance from centroid > selceted percentile

In [10]:
# 3d array of labels. dims=(n_clusters, num_percentiles, num_labels)
labels_arr = []

for i, n_clusters in enumerate(n_clusters_arr):
    kmeans_pth = os.path.join("./clustering", "ae", "kmeans{}".format(n_clusters))
    kmeans = pickle.load(open(kmeans_pth, 'rb'))
    
    labels_arr.append([])

    for percentile in perc_arr:
        # Find max distance for each cluster based on percentile
        clusters_max_dist_arr, new_labels = find_and_remove_outliers_clustering(
            kmeans.labels_, embed_tiles_tr, n_clusters, kmeans.cluster_centers_, percentile) 

        labels_arr[i].append(new_labels)

        # Subdivide all tiles based by original slide
        subdivide_clusters_by_slide(n_clusters, new_labels, percentile)

        # Save max distances for each cluster
        max_dist_cluster_pth = os.path.join("./clustering", "ae", "max_dist_outliers")
        if not os.path.exists(max_dist_cluster_pth):
            os.makedirs(max_dist_cluster_pth)
        np.save(max_dist_cluster_pth+"/max_dist_outliers{}_{}.npy".format(percentile, n_clusters), clusters_max_dist_arr)
        print('Finished {} clusters, percentile {}'.format(n_clusters, percentile))

Finished 32 clusters, percentile 75
Finished 32 clusters, percentile 80
Finished 32 clusters, percentile 85
Finished 32 clusters, percentile 90
Finished 64 clusters, percentile 75
Finished 64 clusters, percentile 80
Finished 64 clusters, percentile 85
Finished 64 clusters, percentile 90
Finished 128 clusters, percentile 75
Finished 128 clusters, percentile 80
Finished 128 clusters, percentile 85
Finished 128 clusters, percentile 90
Finished 256 clusters, percentile 75
Finished 256 clusters, percentile 80
Finished 256 clusters, percentile 85
Finished 256 clusters, percentile 90


Count outlier tiles

In [11]:
for i, n_clusters in enumerate(n_clusters_arr):
    print('KMEANS:', n_clusters)
    for j, percentile in enumerate(perc_arr):
        print('PERC:', percentile)
        labels = labels_arr[i][j]
        print('outliers:', np.count_nonzero(labels == -1))
        print('tot:', len(labels))

KMEANS: 32
PERC: 75
outliers: 332991
tot: 1331953
PERC: 80
outliers: 266398
tot: 1331953
PERC: 85
outliers: 199801
tot: 1331953
PERC: 90
outliers: 133207
tot: 1331953
KMEANS: 64
PERC: 75
outliers: 332995
tot: 1331953
PERC: 80
outliers: 266405
tot: 1331953
PERC: 85
outliers: 199816
tot: 1331953
PERC: 90
outliers: 133220
tot: 1331953
KMEANS: 128
PERC: 75
outliers: 333005
tot: 1331953
PERC: 80
outliers: 266419
tot: 1331953
PERC: 85
outliers: 199837
tot: 1331953
PERC: 90
outliers: 133240
tot: 1331953
KMEANS: 256
PERC: 75
outliers: 333022
tot: 1331953
PERC: 80
outliers: 266446
tot: 1331953
PERC: 85
outliers: 199874
tot: 1331953
PERC: 90
outliers: 133286
tot: 1331953


# Show tiles

In [12]:
num_tiles_per_cluster = 5
cluster_index = 1
perc_index = 0

for n_clusters in n_clusters_arr:
    
    clusters_indexes_arr = []

    for c in range(n_clusters):
        fig, axs = plt.subplots(n_clusters, num_tiles_per_cluster, figsize=(10,2.5*n_clusters))

        
        cluster_index = [i for i, x in enumerate(labels_arr[cluster_index][perc_index]) if x == c]
        random_indexes = np.random.choice(cluster_index, size=num_tiles_per_cluster, replace=False)
        
        for j, rand_idx in enumerate(random_indexes):
            tile, slide_name = get_tile(rand_idx)
            axs[c][j].imshow(tile)
            axs[c][j].axis('off')
            if results_dict[slide_name] == 1:
                positivity = 'pos'
            else:
                positivity = 'neg'
            #axs[c][j].set_title('cluster '+str(c)+" - "+slide_name + ' ' + positivity)
            axs[c][j].set_title('cluster '+str(c)+' - ' + positivity)
    fig.suptitle('Kmeans-{}'.format(n_clusters))
    plt.show()

TypeError: list indices must be integers or slices, not list

In [None]:
n_clusters = 64

kmeans_pth = os.path.join("./clustering", "ae", "kmeans{}".format(n_clusters))
kmeans = pickle.load(open(kmeans_pth, 'rb'))
labels = kmeans.labels_

#unique, counts = np.unique(labels, return_counts=True)
#dict(zip(unique, np.divide(counts*100, len(labels))))

# Internal test set

In [13]:
n_clusters_arr = [32, 64, 128, 256]
#n_clusters_arr = [32]

In [14]:
perc_arr = [75, 80, 85, 90]

In [15]:
# 3d array of labels. dims=(n_clusters, num_percentiles, num_labels)
labels_arr = []

for i, n_clusters in enumerate(n_clusters_arr):
    print('Doing {} clusters'.format(n_clusters))
    
    labels_arr.append([])
    
    # Recover kmeans model
    kmeans_pth = os.path.join("./clustering", "ae", "kmeans{}".format(n_clusters))
    kmeans = pickle.load(open(kmeans_pth, 'rb'))
    
    # Predict labels
    labels = kmeans.predict(embed_tiles_ts)
        
    for percentile in perc_arr:
        # Recover max regarding a percentile
        max_dist_cluster_pth = os.path.join(
            "./clustering", "ae", "max_dist_outliers", "max_dist_outliers{}_{}.npy".format(percentile, n_clusters))
        clusters_max_dist_arr = np.load(max_dist_cluster_pth)

        # Apply removal of tiles with distance over max
        new_labels = remove_outliers_clustering(clusters_max_dist_arr,
            labels, embed_tiles_ts, n_clusters, kmeans.cluster_centers_) 

        labels_arr[i].append(new_labels)
        # Subdivide all tiles based by original slide
        subdivide_clusters_by_slide(n_clusters, new_labels, percentile, dataset='ts')

        print('Finished {} clusters, percentile {}'.format(n_clusters, percentile))

Doing 32 clusters
Finished 32 clusters, percentile 75
Finished 32 clusters, percentile 80
Finished 32 clusters, percentile 85
Finished 32 clusters, percentile 90
Doing 64 clusters
Finished 64 clusters, percentile 75
Finished 64 clusters, percentile 80
Finished 64 clusters, percentile 85
Finished 64 clusters, percentile 90
Doing 128 clusters
Finished 128 clusters, percentile 75
Finished 128 clusters, percentile 80
Finished 128 clusters, percentile 85
Finished 128 clusters, percentile 90
Doing 256 clusters
Finished 256 clusters, percentile 75
Finished 256 clusters, percentile 80
Finished 256 clusters, percentile 85
Finished 256 clusters, percentile 90


In [16]:
for i, n_clusters in enumerate(n_clusters_arr):
    for j, percentile in enumerate(perc_arr):
        labels = labels_arr[i][j]
        print('{} CLUSTERS, PERCENTILE {}'.format(n_clusters, percentile))
        print('outliers:', np.count_nonzero(labels == -1))
        print('tot:', len(labels))

32 CLUSTERS, PERCENTILE 75
outliers: 63384
tot: 207736
32 CLUSTERS, PERCENTILE 80
outliers: 53071
tot: 207736
32 CLUSTERS, PERCENTILE 85
outliers: 42502
tot: 207736
32 CLUSTERS, PERCENTILE 90
outliers: 31328
tot: 207736
64 CLUSTERS, PERCENTILE 75
outliers: 62458
tot: 207736
64 CLUSTERS, PERCENTILE 80
outliers: 52349
tot: 207736
64 CLUSTERS, PERCENTILE 85
outliers: 41929
tot: 207736
64 CLUSTERS, PERCENTILE 90
outliers: 30751
tot: 207736
128 CLUSTERS, PERCENTILE 75
outliers: 62460
tot: 207736
128 CLUSTERS, PERCENTILE 80
outliers: 52215
tot: 207736
128 CLUSTERS, PERCENTILE 85
outliers: 41883
tot: 207736
128 CLUSTERS, PERCENTILE 90
outliers: 30938
tot: 207736
256 CLUSTERS, PERCENTILE 75
outliers: 62322
tot: 207736
256 CLUSTERS, PERCENTILE 80
outliers: 52243
tot: 207736
256 CLUSTERS, PERCENTILE 85
outliers: 41795
tot: 207736
256 CLUSTERS, PERCENTILE 90
outliers: 30954
tot: 207736


# External Test set

In [17]:
n_clusters_arr = [32, 64, 128, 256]
#n_clusters_arr = [32]

In [18]:
perc_arr = [75, 80, 85, 90]

In [19]:
# 3d array of labels. dims=(n_clusters, num_percentiles, num_labels)
labels_arr = []

for i, n_clusters in enumerate(n_clusters_arr):
    print('Doing {} clusters'.format(n_clusters))
    
    # Recover kmeans model
    kmeans_pth = os.path.join("./clustering", "ae", "kmeans{}".format(n_clusters))
    kmeans = pickle.load(open(kmeans_pth, 'rb'))
    
    labels_arr.append([])
    
    # Predict labels
    labels = kmeans.predict(embed_tiles_ext)
    
    for percentile in perc_arr:
        # Recover max regarding a percentile
        max_dist_cluster_pth = os.path.join(
            "./clustering", "ae", "max_dist_outliers", "max_dist_outliers{}_{}.npy".format(percentile, n_clusters))
        clusters_max_dist_arr = np.load(max_dist_cluster_pth)

        # Apply removal of tiles with distance over max
        new_labels = remove_outliers_clustering(clusters_max_dist_arr,
            labels, embed_tiles_ext, n_clusters, kmeans.cluster_centers_) 

        labels_arr[i].append(new_labels)
        # Subdivide all tiles based by original slide
        subdivide_clusters_by_slide(n_clusters, new_labels, percentile, dataset='ext')

        print('Finished {} clusters, percentile {}'.format(n_clusters, percentile))

Doing 32 clusters
Finished 32 clusters, percentile 75
Finished 32 clusters, percentile 80
Finished 32 clusters, percentile 85
Finished 32 clusters, percentile 90
Doing 64 clusters
Finished 64 clusters, percentile 75
Finished 64 clusters, percentile 80
Finished 64 clusters, percentile 85
Finished 64 clusters, percentile 90
Doing 128 clusters
Finished 128 clusters, percentile 75
Finished 128 clusters, percentile 80
Finished 128 clusters, percentile 85
Finished 128 clusters, percentile 90
Doing 256 clusters
Finished 256 clusters, percentile 75
Finished 256 clusters, percentile 80
Finished 256 clusters, percentile 85
Finished 256 clusters, percentile 90


Count outlier tiles

In [20]:
for i, n_clusters in enumerate(n_clusters_arr):
    for j, percentile in enumerate(perc_arr):
        labels = labels_arr[i][j]
        print('{} CLUSTERS, PERCENTILE {}'.format(n_clusters, percentile))
        print('outliers:', np.count_nonzero(labels == -1))
        print('tot:', len(labels))

32 CLUSTERS, PERCENTILE 75
outliers: 106063
tot: 584729
32 CLUSTERS, PERCENTILE 80
outliers: 80846
tot: 584729
32 CLUSTERS, PERCENTILE 85
outliers: 56788
tot: 584729
32 CLUSTERS, PERCENTILE 90
outliers: 34237
tot: 584729
64 CLUSTERS, PERCENTILE 75
outliers: 108217
tot: 584729
64 CLUSTERS, PERCENTILE 80
outliers: 82822
tot: 584729
64 CLUSTERS, PERCENTILE 85
outliers: 58228
tot: 584729
64 CLUSTERS, PERCENTILE 90
outliers: 35115
tot: 584729
128 CLUSTERS, PERCENTILE 75
outliers: 116732
tot: 584729
128 CLUSTERS, PERCENTILE 80
outliers: 91009
tot: 584729
128 CLUSTERS, PERCENTILE 85
outliers: 65421
tot: 584729
128 CLUSTERS, PERCENTILE 90
outliers: 40812
tot: 584729
256 CLUSTERS, PERCENTILE 75
outliers: 125287
tot: 584729
256 CLUSTERS, PERCENTILE 80
outliers: 98850
tot: 584729
256 CLUSTERS, PERCENTILE 85
outliers: 72744
tot: 584729
256 CLUSTERS, PERCENTILE 90
outliers: 46892
tot: 584729


Show tiles clusters ext

In [None]:
num_tiles_per_cluster = 5
cluster_index = 1
perc_index = 0

print(len(labels_arr[cluster_index][perc_index]))
    
clusters_indexes_arr = []
fig, axs = plt.subplots(n_clusters, num_tiles_per_cluster, figsize=(10,2.5*n_clusters))

for c in range(n_clusters):
    
    fig, axs = plt.subplots(n_clusters, num_tiles_per_cluster, figsize=(10,2.5*n_clusters))

    cluster_index = [i for i, x in enumerate(labels_arr[cluster_index][perc_index]) if x == c]
    
    
    if len(cluster_index) >= num_tiles_per_cluster: 
        random_indexes = np.random.choice(cluster_index, size=num_tiles_per_cluster, replace=False)

        for j, rand_idx in enumerate(random_indexes):
            tile, slide_name = get_tile(rand_idx, 'ext')
            axs[c][j].imshow(tile)
            axs[c][j].axis('off')
            if ext_results_dict[slide_name] == 1:
                positivity = 'pos'
            else:
                positivity = 'neg'
            #axs[c][j].set_title('cluster '+str(c)+" - "+slide_name + ' ' + positivity)
            axs[c][j].set_title('cluster '+str(c)+' - ' + positivity)
    plt.suptitle('Kmeans-{}'.format(n_clusters))
plt.show()

Count samples percentage for each cluster

In [None]:
unique, counts = np.unique(labels_arr[1], return_counts=True)
dict(zip(unique, np.divide(counts*100, len(labels_arr[1]))))