In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import json
from PIL import Image
import pickle
import time

from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

from define_dataset import define_dataset
from define_ext_dataset import define_ext_dataset
from remove_outliers_clustering import find_and_remove_outliers_clustering, remove_outliers_clustering

# Cluster labels subdivision by slide

In [2]:
def subdivide_clusters_by_slide(n_clusters, labels, percentile, dataset='comb_tr'):
    # Labels are still ordered by slide, ie the first 25000 tiles are from the first slide, tile 25001
    # to 70000 is from the second slide and so on
    
    if dataset=='comb_val':
        num_tiles_dict = num_tiles_dict_comb_val
    elif dataset=='comb_ts':
        num_tiles_dict = num_tiles_dict_comb_ts
    else:
        num_tiles_dict = num_tiles_dict_comb_tr

    slide_cluster_pth = "./clustering/comb_ae/slide_clusters_rem_outliers/kmeans{}_out_perc{}".format(n_clusters, percentile)
    if not os.path.exists(slide_cluster_pth):
        os.makedirs(slide_cluster_pth)
    
    # Count tile labels for each slide and save them in same slide folder
    start = 0
    end = 0
    for slide_name in num_tiles_dict:
        slide_num_tiles = num_tiles_dict[slide_name]
        end += slide_num_tiles
        slide_labels = labels[start:end]
        start += slide_num_tiles
        np.save(os.path.join(slide_cluster_pth, slide_name+".npy"), slide_labels)

# Import num tiles dicts

In [3]:
# Dicts with number of tiles for each slide
tiles_num_dict_path = './data/num_tiles_dict_comb_tr.json'
with open(tiles_num_dict_path) as json_file:
    num_tiles_dict_comb_tr = json.load(json_file)
    
tiles_num_dict_path = './data/num_tiles_dict_comb_val.json'
with open(tiles_num_dict_path) as json_file:
    num_tiles_dict_comb_val = json.load(json_file)

tiles_num_dict_path = './data/num_tiles_dict_comb_ts.json'
with open(tiles_num_dict_path) as json_file:
    num_tiles_dict_comb_ts = json.load(json_file)

In [4]:
 def get_tile(index, dataset='tr'):
    tiles_path='../WSI/tiles/'
    curr_tiles_sum = 0
    
    if dataset=='ts':
        num_tiles_dict = num_tiles_dict_ts
    elif dataset=='ext':
        num_tiles_dict = num_tiles_dict_ext
    else:
        num_tiles_dict = num_tiles_dict_tr

    for slide_name in num_tiles_dict:
        if index < curr_tiles_sum + num_tiles_dict[slide_name]:
            selected_slide_name = slide_name
            break
        curr_tiles_sum += num_tiles_dict[slide_name]

    tile_idx = index - curr_tiles_sum

    tile_path = os.path.join(tiles_path, selected_slide_name, str(tile_idx) + '.jpg')

    img = Image.open(tile_path)


    return img, selected_slide_name

# Import tile embeddings

In [5]:
epoch = 20
save_model_path = './ae_models/comb_ae/'

embed_path_comb_tr = os.path.join(save_model_path, 'z_ae_test_epoch{}.npy'.format(epoch))
embed_tiles_comb_tr = np.load(embed_path_comb_tr)

embed_path_comb_val = os.path.join(save_model_path, 'z_ae_internal_test_epoch{}.npy'.format(epoch))
embed_tiles_comb_val = np.load(embed_path_comb_val)

embed_path_comb_ts = os.path.join(save_model_path, 'z_ae_ext_test_epoch{}.npy'.format(epoch))
embed_tiles_comb_ts = np.load(embed_path_comb_ts)

# Remove Outliers from clustering and find max in Tr set

In [6]:
n_clusters_arr = [32, 64, 128, 256]

In [7]:
perc_arr = [75, 80, 85, 90]

Remove from each cluster the tiles with distance from centroid > selceted percentile

In [8]:
# 3d array of labels. dims=(n_clusters, num_percentiles, num_labels)
labels_arr = []

for i, n_clusters in enumerate(n_clusters_arr):
    kmeans_pth = os.path.join("./clustering", "comb_ae", "kmeans{}".format(n_clusters))
    kmeans = pickle.load(open(kmeans_pth, 'rb'))
    
    labels_arr.append([])

    for percentile in perc_arr:
        # Find max distance for each cluster based on percentile
        clusters_max_dist_arr, new_labels = find_and_remove_outliers_clustering(
            kmeans.labels_, embed_tiles_comb_tr, n_clusters, kmeans.cluster_centers_, percentile) 

        labels_arr[i].append(new_labels)

        # Subdivide all tiles based by original slide
        subdivide_clusters_by_slide(n_clusters, new_labels, percentile)

        # Save max distances for each cluster
        max_dist_cluster_pth = os.path.join("./clustering", "comb_ae", "max_dist_outliers")
        if not os.path.exists(max_dist_cluster_pth):
            os.makedirs(max_dist_cluster_pth)
        np.save(max_dist_cluster_pth+"/max_dist_outliers{}_{}.npy".format(percentile, n_clusters), clusters_max_dist_arr)
        print('Finished {} clusters, percentile {}'.format(n_clusters, percentile))

Finished 32 clusters, percentile 75
Finished 32 clusters, percentile 80
Finished 32 clusters, percentile 85
Finished 32 clusters, percentile 90
Finished 64 clusters, percentile 75
Finished 64 clusters, percentile 80
Finished 64 clusters, percentile 85
Finished 64 clusters, percentile 90
Finished 128 clusters, percentile 75
Finished 128 clusters, percentile 80
Finished 128 clusters, percentile 85
Finished 128 clusters, percentile 90
Finished 256 clusters, percentile 75
Finished 256 clusters, percentile 80
Finished 256 clusters, percentile 85
Finished 256 clusters, percentile 90


Count outlier tiles

In [9]:
for i, n_clusters in enumerate(n_clusters_arr):
    print('KMEANS:', n_clusters)
    for j, percentile in enumerate(perc_arr):
        print('PERC:', percentile)
        labels = labels_arr[i][j]
        print('outliers:', np.count_nonzero(labels == -1))
        print('tot:', len(labels))

KMEANS: 32
PERC: 75
outliers: 335466
tot: 1341850
PERC: 80
outliers: 268378
tot: 1341850
PERC: 85
outliers: 201288
tot: 1341850
PERC: 90
outliers: 134196
tot: 1341850
KMEANS: 64
PERC: 75
outliers: 335472
tot: 1341850
PERC: 80
outliers: 268384
tot: 1341850
PERC: 85
outliers: 201299
tot: 1341850
PERC: 90
outliers: 134208
tot: 1341850
KMEANS: 128
PERC: 75
outliers: 335468
tot: 1341850
PERC: 80
outliers: 268396
tot: 1341850
PERC: 85
outliers: 201319
tot: 1341850
PERC: 90
outliers: 134228
tot: 1341850
KMEANS: 256
PERC: 75
outliers: 335487
tot: 1341850
PERC: 80
outliers: 268417
tot: 1341850
PERC: 85
outliers: 201366
tot: 1341850
PERC: 90
outliers: 134265
tot: 1341850


# Validation set

In [10]:
n_clusters_arr = [32, 64, 128, 256]

In [11]:
perc_arr = [75, 80, 85, 90]

In [12]:
# 3d array of labels. dims=(n_clusters, num_percentiles, num_labels)
labels_arr = []

for i, n_clusters in enumerate(n_clusters_arr):
    print('Doing {} clusters'.format(n_clusters))
    
    labels_arr.append([])
    
    # Recover kmeans model
    kmeans_pth = os.path.join("./clustering", "comb_ae", "kmeans{}".format(n_clusters))
    kmeans = pickle.load(open(kmeans_pth, 'rb'))
    
    # Predict labels
    labels = kmeans.predict(embed_tiles_comb_val)
        
    for percentile in perc_arr:
        # Recover max regarding a percentile
        max_dist_cluster_pth = os.path.join(
            "./clustering", "comb_ae", "max_dist_outliers", "max_dist_outliers{}_{}.npy".format(percentile, n_clusters))
        clusters_max_dist_arr = np.load(max_dist_cluster_pth)

        # Apply removal of tiles with distance over max
        new_labels = remove_outliers_clustering(clusters_max_dist_arr,
            labels, embed_tiles_comb_val, n_clusters, kmeans.cluster_centers_) 

        labels_arr[i].append(new_labels)
        # Subdivide all tiles based by original slide
        subdivide_clusters_by_slide(n_clusters, new_labels, percentile, dataset='comb_val')

        print('Finished {} clusters, percentile {}'.format(n_clusters, percentile))

Doing 32 clusters
Finished 32 clusters, percentile 75
Finished 32 clusters, percentile 80
Finished 32 clusters, percentile 85
Finished 32 clusters, percentile 90
Doing 64 clusters
Finished 64 clusters, percentile 75
Finished 64 clusters, percentile 80
Finished 64 clusters, percentile 85
Finished 64 clusters, percentile 90
Doing 128 clusters
Finished 128 clusters, percentile 75
Finished 128 clusters, percentile 80
Finished 128 clusters, percentile 85
Finished 128 clusters, percentile 90
Doing 256 clusters
Finished 256 clusters, percentile 75
Finished 256 clusters, percentile 80
Finished 256 clusters, percentile 85
Finished 256 clusters, percentile 90


In [13]:
for i, n_clusters in enumerate(n_clusters_arr):
    for j, percentile in enumerate(perc_arr):
        labels = labels_arr[i][j]
        print('{} CLUSTERS, PERCENTILE {}'.format(n_clusters, percentile))
        print('outliers:', np.count_nonzero(labels == -1))
        print('tot:', len(labels))

32 CLUSTERS, PERCENTILE 75
outliers: 114917
tot: 428559
32 CLUSTERS, PERCENTILE 80
outliers: 92305
tot: 428559
32 CLUSTERS, PERCENTILE 85
outliers: 69045
tot: 428559
32 CLUSTERS, PERCENTILE 90
outliers: 45629
tot: 428559
64 CLUSTERS, PERCENTILE 75
outliers: 109880
tot: 428559
64 CLUSTERS, PERCENTILE 80
outliers: 87553
tot: 428559
64 CLUSTERS, PERCENTILE 85
outliers: 65295
tot: 428559
64 CLUSTERS, PERCENTILE 90
outliers: 43392
tot: 428559
128 CLUSTERS, PERCENTILE 75
outliers: 111069
tot: 428559
128 CLUSTERS, PERCENTILE 80
outliers: 88803
tot: 428559
128 CLUSTERS, PERCENTILE 85
outliers: 66286
tot: 428559
128 CLUSTERS, PERCENTILE 90
outliers: 43858
tot: 428559
256 CLUSTERS, PERCENTILE 75
outliers: 113312
tot: 428559
256 CLUSTERS, PERCENTILE 80
outliers: 90847
tot: 428559
256 CLUSTERS, PERCENTILE 85
outliers: 67774
tot: 428559
256 CLUSTERS, PERCENTILE 90
outliers: 44538
tot: 428559


# Test set

In [14]:
n_clusters_arr = [32, 64, 128, 256]

In [15]:
perc_arr = [75, 80, 85, 90]

In [16]:
# 3d array of labels. dims=(n_clusters, num_percentiles, num_labels)
labels_arr = []

for i, n_clusters in enumerate(n_clusters_arr):
    print('Doing {} clusters'.format(n_clusters))
    
    # Recover kmeans model
    kmeans_pth = os.path.join("./clustering", "comb_ae", "kmeans{}".format(n_clusters))
    kmeans = pickle.load(open(kmeans_pth, 'rb'))
    
    labels_arr.append([])
    
    # Predict labels
    labels = kmeans.predict(embed_tiles_comb_ts)
    
    unique, counts = np.unique(labels, return_counts=True)
    dict(zip(unique, counts))
    
    for percentile in perc_arr:
        # Recover max regarding a percentile
        max_dist_cluster_pth = os.path.join(
            "./clustering", "comb_ae", "max_dist_outliers", "max_dist_outliers{}_{}.npy".format(percentile, n_clusters))
        clusters_max_dist_arr = np.load(max_dist_cluster_pth)

        # Apply removal of tiles with distance over max
        new_labels = remove_outliers_clustering(clusters_max_dist_arr,
            labels, embed_tiles_comb_ts, n_clusters, kmeans.cluster_centers_) 

        labels_arr[i].append(new_labels)
        # Subdivide all tiles based by original slide
        subdivide_clusters_by_slide(n_clusters, new_labels, percentile, dataset='comb_ts')

        print('Finished {} clusters, percentile {}'.format(n_clusters, percentile))

Doing 32 clusters
Finished 32 clusters, percentile 75
Finished 32 clusters, percentile 80
Finished 32 clusters, percentile 85
Finished 32 clusters, percentile 90
Doing 64 clusters
Finished 64 clusters, percentile 75
Finished 64 clusters, percentile 80
Finished 64 clusters, percentile 85
Finished 64 clusters, percentile 90
Doing 128 clusters
Finished 128 clusters, percentile 75
Finished 128 clusters, percentile 80
Finished 128 clusters, percentile 85
Finished 128 clusters, percentile 90
Doing 256 clusters
Finished 256 clusters, percentile 75
Finished 256 clusters, percentile 80
Finished 256 clusters, percentile 85
Finished 256 clusters, percentile 90


Count outlier tiles

In [17]:
for i, n_clusters in enumerate(n_clusters_arr):
    for j, percentile in enumerate(perc_arr):
        labels = labels_arr[i][j]
        print('{} CLUSTERS, PERCENTILE {}'.format(n_clusters, percentile))
        print('outliers:', np.count_nonzero(labels == -1))
        print('tot:', len(labels))

32 CLUSTERS, PERCENTILE 75
outliers: 108569
tot: 354009
32 CLUSTERS, PERCENTILE 80
outliers: 87071
tot: 354009
32 CLUSTERS, PERCENTILE 85
outliers: 65178
tot: 354009
32 CLUSTERS, PERCENTILE 90
outliers: 42618
tot: 354009
64 CLUSTERS, PERCENTILE 75
outliers: 107544
tot: 354009
64 CLUSTERS, PERCENTILE 80
outliers: 86081
tot: 354009
64 CLUSTERS, PERCENTILE 85
outliers: 64134
tot: 354009
64 CLUSTERS, PERCENTILE 90
outliers: 41957
tot: 354009
128 CLUSTERS, PERCENTILE 75
outliers: 109419
tot: 354009
128 CLUSTERS, PERCENTILE 80
outliers: 88747
tot: 354009
128 CLUSTERS, PERCENTILE 85
outliers: 66965
tot: 354009
128 CLUSTERS, PERCENTILE 90
outliers: 44226
tot: 354009
256 CLUSTERS, PERCENTILE 75
outliers: 111091
tot: 354009
256 CLUSTERS, PERCENTILE 80
outliers: 90297
tot: 354009
256 CLUSTERS, PERCENTILE 85
outliers: 68366
tot: 354009
256 CLUSTERS, PERCENTILE 90
outliers: 45178
tot: 354009


Count samples percentage for each cluster

In [18]:
unique, counts = np.unique(labels_arr[1], return_counts=True)
dict(zip(unique, np.divide(counts*100, len(labels_arr[1]))))

{-1: 7492900.0,
 0: 199350.0,
 1: 613025.0,
 2: 564300.0,
 3: 519225.0,
 4: 190300.0,
 5: 949900.0,
 6: 394625.0,
 7: 503600.0,
 8: 659250.0,
 9: 330050.0,
 10: 341950.0,
 11: 192450.0,
 12: 293650.0,
 13: 978775.0,
 14: 503325.0,
 15: 282250.0,
 16: 125.0,
 17: 43500.0,
 18: 776600.0,
 19: 447825.0,
 20: 256650.0,
 21: 657975.0,
 22: 218225.0,
 23: 278425.0,
 24: 432650.0,
 25: 597050.0,
 26: 668525.0,
 27: 290875.0,
 28: 1302775.0,
 29: 295475.0,
 30: 246425.0,
 31: 457975.0,
 32: 527200.0,
 33: 93475.0,
 34: 375100.0,
 35: 292275.0,
 36: 334750.0,
 37: 438350.0,
 38: 405875.0,
 39: 819100.0,
 40: 419300.0,
 41: 314000.0,
 42: 272525.0,
 43: 404475.0,
 44: 287975.0,
 45: 960650.0,
 46: 207100.0,
 47: 286100.0,
 48: 314450.0,
 49: 360975.0,
 50: 545600.0,
 51: 201575.0,
 52: 1207750.0,
 53: 446700.0,
 54: 479675.0,
 55: 195625.0,
 56: 227900.0,
 57: 836175.0,
 58: 478225.0,
 59: 100.0,
 60: 510250.0,
 61: 279325.0,
 62: 382650.0,
 63: 515675.0}