In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import json
from PIL import Image
import pickle
import time

from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

from define_dataset import define_dataset
from define_ext_dataset import define_ext_dataset
from remove_outliers_clustering import find_and_remove_outliers_clustering, remove_outliers_clustering

# Cluster labels subdivision by slide

In [2]:
def subdivide_clusters_by_slide(n_clusters, labels, percentile, dataset='comb_tr'):
    # Labels are still ordered by slide, ie the first 25000 tiles are from the first slide, tile 25001
    # to 70000 is from the second slide and so on
    
    if dataset=='comb_val':
        num_tiles_dict = num_tiles_dict_comb_val
    elif dataset=='comb_ts':
        num_tiles_dict = num_tiles_dict_comb_ts
    else:
        num_tiles_dict = num_tiles_dict_comb_tr

    slide_cluster_pth = "./clustering/comb_ae/slide_clusters_rem_outliers/kmeans{}_out_perc{}".format(n_clusters, percentile)
    if not os.path.exists(slide_cluster_pth):
        os.makedirs(slide_cluster_pth)
    
    # Count tile labels for each slide and save them in same slide folder
    start = 0
    end = 0
    for slide_name in num_tiles_dict:
        slide_num_tiles = num_tiles_dict[slide_name]
        end += slide_num_tiles
        slide_labels = labels[start:end]
        start += slide_num_tiles
        np.save(os.path.join(slide_cluster_pth, slide_name+".npy"), slide_labels)

# Import num tiles dicts

In [3]:
# Dicts with number of tiles for each slide
tiles_num_dict_path = './data/num_tiles_dict_comb_tr.json'
with open(tiles_num_dict_path) as json_file:
    num_tiles_dict_comb_tr = json.load(json_file)
    
tiles_num_dict_path = './data/num_tiles_dict_comb_val.json'
with open(tiles_num_dict_path) as json_file:
    num_tiles_dict_comb_val = json.load(json_file)

tiles_num_dict_path = './data/num_tiles_dict_comb_ts.json'
with open(tiles_num_dict_path) as json_file:
    num_tiles_dict_comb_ts = json.load(json_file)

In [4]:
 def get_tile(index, dataset='tr'):
    tiles_path='../WSI/tiles/'
    curr_tiles_sum = 0
    
    if dataset=='ts':
        num_tiles_dict = num_tiles_dict_ts
    elif dataset=='ext':
        num_tiles_dict = num_tiles_dict_ext
    else:
        num_tiles_dict = num_tiles_dict_tr

    for slide_name in num_tiles_dict:
        if index < curr_tiles_sum + num_tiles_dict[slide_name]:
            selected_slide_name = slide_name
            break
        curr_tiles_sum += num_tiles_dict[slide_name]

    tile_idx = index - curr_tiles_sum

    tile_path = os.path.join(tiles_path, selected_slide_name, str(tile_idx) + '.jpg')

    img = Image.open(tile_path)


    return img, selected_slide_name

# Import tile embeddings

In [5]:
epoch = 20
save_model_path = './ae_models/comb_ae/'

embed_path_comb_tr = os.path.join(save_model_path, 'z_ae_test_epoch{}.npy'.format(epoch))
embed_tiles_comb_tr = np.load(embed_path_comb_tr)

embed_path_comb_val = os.path.join(save_model_path, 'z_ae_internal_test_epoch{}.npy'.format(epoch))
embed_tiles_comb_val = np.load(embed_path_comb_val)

embed_path_comb_ts = os.path.join(save_model_path, 'z_ae_ext_test_epoch{}.npy'.format(epoch))
embed_tiles_comb_ts = np.load(embed_path_comb_ts)

# Remove Outliers from clustering and find max in Tr set

In [6]:
n_clusters_arr = [32, 64, 128, 256]

In [7]:
perc_arr = [75, 80, 85, 90]

Remove from each cluster the tiles with distance from centroid > selceted percentile

In [8]:
# 3d array of labels. dims=(n_clusters, num_percentiles, num_labels)
labels_arr = []

for i, n_clusters in enumerate(n_clusters_arr):
    kmeans_pth = os.path.join("./clustering", "comb_ae", "kmeans{}".format(n_clusters))
    kmeans = pickle.load(open(kmeans_pth, 'rb'))
    
    labels_arr.append([])

    for percentile in perc_arr:
        # Find max distance for each cluster based on percentile
        clusters_max_dist_arr, new_labels = find_and_remove_outliers_clustering(
            kmeans.labels_, embed_tiles_comb_tr, n_clusters, kmeans.cluster_centers_, percentile) 

        labels_arr[i].append(new_labels)

        # Subdivide all tiles based by original slide
        subdivide_clusters_by_slide(n_clusters, new_labels, percentile)

        # Save max distances for each cluster
        max_dist_cluster_pth = os.path.join("./clustering", "comb_ae", "max_dist_outliers")
        if not os.path.exists(max_dist_cluster_pth):
            os.makedirs(max_dist_cluster_pth)
        np.save(max_dist_cluster_pth+"/max_dist_outliers{}_{}.npy".format(percentile, n_clusters), clusters_max_dist_arr)
        print('Finished {} clusters, percentile {}'.format(n_clusters, percentile))

Finished 32 clusters, percentile 75
Finished 32 clusters, percentile 80
Finished 32 clusters, percentile 85
Finished 32 clusters, percentile 90
Finished 64 clusters, percentile 75
Finished 64 clusters, percentile 80
Finished 64 clusters, percentile 85
Finished 64 clusters, percentile 90
Finished 128 clusters, percentile 75
Finished 128 clusters, percentile 80
Finished 128 clusters, percentile 85
Finished 128 clusters, percentile 90
Finished 256 clusters, percentile 75
Finished 256 clusters, percentile 80
Finished 256 clusters, percentile 85
Finished 256 clusters, percentile 90


Count outlier tiles

In [9]:
for i, n_clusters in enumerate(n_clusters_arr):
    print('KMEANS:', n_clusters)
    for j, percentile in enumerate(perc_arr):
        print('PERC:', percentile)
        labels = labels_arr[i][j]
        print('outliers:', np.count_nonzero(labels == -1))
        print('tot:', len(labels))

KMEANS: 32
PERC: 75
outliers: 323112
tot: 1292425
PERC: 80
outliers: 258491
tot: 1292425
PERC: 85
outliers: 193876
tot: 1292425
PERC: 90
outliers: 129256
tot: 1292425
KMEANS: 64
PERC: 75
outliers: 323114
tot: 1292425
PERC: 80
outliers: 258498
tot: 1292425
PERC: 85
outliers: 193881
tot: 1292425
PERC: 90
outliers: 129267
tot: 1292425
KMEANS: 128
PERC: 75
outliers: 323123
tot: 1292425
PERC: 80
outliers: 258512
tot: 1292425
PERC: 85
outliers: 193910
tot: 1292425
PERC: 90
outliers: 129292
tot: 1292425
KMEANS: 256
PERC: 75
outliers: 323136
tot: 1292425
PERC: 80
outliers: 258540
tot: 1292425
PERC: 85
outliers: 193941
tot: 1292425
PERC: 90
outliers: 129331
tot: 1292425


# Validation set

In [10]:
n_clusters_arr = [32, 64, 128, 256]

In [11]:
perc_arr = [75, 80, 85, 90]

In [12]:
# 3d array of labels. dims=(n_clusters, num_percentiles, num_labels)
labels_arr = []

for i, n_clusters in enumerate(n_clusters_arr):
    print('Doing {} clusters'.format(n_clusters))
    
    labels_arr.append([])
    
    # Recover kmeans model
    kmeans_pth = os.path.join("./clustering", "comb_ae", "kmeans{}".format(n_clusters))
    kmeans = pickle.load(open(kmeans_pth, 'rb'))
    
    # Predict labels
    labels = kmeans.predict(embed_tiles_comb_val)
        
    for percentile in perc_arr:
        # Recover max regarding a percentile
        max_dist_cluster_pth = os.path.join(
            "./clustering", "comb_ae", "max_dist_outliers", "max_dist_outliers{}_{}.npy".format(percentile, n_clusters))
        clusters_max_dist_arr = np.load(max_dist_cluster_pth)

        # Apply removal of tiles with distance over max
        new_labels = remove_outliers_clustering(clusters_max_dist_arr,
            labels, embed_tiles_comb_val, n_clusters, kmeans.cluster_centers_) 

        labels_arr[i].append(new_labels)
        # Subdivide all tiles based by original slide
        subdivide_clusters_by_slide(n_clusters, new_labels, percentile, dataset='comb_val')

        print('Finished {} clusters, percentile {}'.format(n_clusters, percentile))

Doing 32 clusters
Finished 32 clusters, percentile 75
Finished 32 clusters, percentile 80
Finished 32 clusters, percentile 85
Finished 32 clusters, percentile 90
Doing 64 clusters
Finished 64 clusters, percentile 75
Finished 64 clusters, percentile 80
Finished 64 clusters, percentile 85
Finished 64 clusters, percentile 90
Doing 128 clusters
Finished 128 clusters, percentile 75
Finished 128 clusters, percentile 80
Finished 128 clusters, percentile 85
Finished 128 clusters, percentile 90
Doing 256 clusters
Finished 256 clusters, percentile 75
Finished 256 clusters, percentile 80
Finished 256 clusters, percentile 85
Finished 256 clusters, percentile 90


In [13]:
for i, n_clusters in enumerate(n_clusters_arr):
    for j, percentile in enumerate(perc_arr):
        labels = labels_arr[i][j]
        print('{} CLUSTERS, PERCENTILE {}'.format(n_clusters, percentile))
        print('outliers:', np.count_nonzero(labels == -1))
        print('tot:', len(labels))

32 CLUSTERS, PERCENTILE 75
outliers: 118773
tot: 426670
32 CLUSTERS, PERCENTILE 80
outliers: 96087
tot: 426670
32 CLUSTERS, PERCENTILE 85
outliers: 73079
tot: 426670
32 CLUSTERS, PERCENTILE 90
outliers: 50147
tot: 426670
64 CLUSTERS, PERCENTILE 75
outliers: 116471
tot: 426670
64 CLUSTERS, PERCENTILE 80
outliers: 94358
tot: 426670
64 CLUSTERS, PERCENTILE 85
outliers: 71895
tot: 426670
64 CLUSTERS, PERCENTILE 90
outliers: 49242
tot: 426670
128 CLUSTERS, PERCENTILE 75
outliers: 114528
tot: 426670
128 CLUSTERS, PERCENTILE 80
outliers: 92365
tot: 426670
128 CLUSTERS, PERCENTILE 85
outliers: 70291
tot: 426670
128 CLUSTERS, PERCENTILE 90
outliers: 47850
tot: 426670
256 CLUSTERS, PERCENTILE 75
outliers: 113789
tot: 426670
256 CLUSTERS, PERCENTILE 80
outliers: 91795
tot: 426670
256 CLUSTERS, PERCENTILE 85
outliers: 69915
tot: 426670
256 CLUSTERS, PERCENTILE 90
outliers: 47513
tot: 426670


# Test set

In [6]:
n_clusters_arr = [32, 64, 128, 256]

In [7]:
perc_arr = [75, 80, 85, 90]

In [8]:
# 3d array of labels. dims=(n_clusters, num_percentiles, num_labels)
labels_arr = []

for i, n_clusters in enumerate(n_clusters_arr):
    print('Doing {} clusters'.format(n_clusters))
    
    # Recover kmeans model
    kmeans_pth = os.path.join("./clustering", "comb_ae", "kmeans{}".format(n_clusters))
    kmeans = pickle.load(open(kmeans_pth, 'rb'))
    
    labels_arr.append([])
    
    # Predict labels
    labels = kmeans.predict(embed_tiles_comb_ts)
    
    unique, counts = np.unique(labels, return_counts=True)
    dict(zip(unique, counts))
    
    for percentile in perc_arr:
        # Recover max regarding a percentile
        max_dist_cluster_pth = os.path.join(
            "./clustering", "comb_ae", "max_dist_outliers", "max_dist_outliers{}_{}.npy".format(percentile, n_clusters))
        clusters_max_dist_arr = np.load(max_dist_cluster_pth)

        # Apply removal of tiles with distance over max
        new_labels = remove_outliers_clustering(clusters_max_dist_arr,
            labels, embed_tiles_comb_ts, n_clusters, kmeans.cluster_centers_) 

        labels_arr[i].append(new_labels)
        # Subdivide all tiles based by original slide
        subdivide_clusters_by_slide(n_clusters, new_labels, percentile, dataset='comb_ts')

        print('Finished {} clusters, percentile {}'.format(n_clusters, percentile))

Doing 32 clusters
Finished 32 clusters, percentile 75
Finished 32 clusters, percentile 80
Finished 32 clusters, percentile 85
Finished 32 clusters, percentile 90
Doing 64 clusters
Finished 64 clusters, percentile 75
Finished 64 clusters, percentile 80
Finished 64 clusters, percentile 85
Finished 64 clusters, percentile 90
Doing 128 clusters
Finished 128 clusters, percentile 75
Finished 128 clusters, percentile 80
Finished 128 clusters, percentile 85
Finished 128 clusters, percentile 90
Doing 256 clusters
Finished 256 clusters, percentile 75
Finished 256 clusters, percentile 80
Finished 256 clusters, percentile 85
Finished 256 clusters, percentile 90


Count outlier tiles

In [9]:
for i, n_clusters in enumerate(n_clusters_arr):
    for j, percentile in enumerate(perc_arr):
        labels = labels_arr[i][j]
        print('{} CLUSTERS, PERCENTILE {}'.format(n_clusters, percentile))
        print('outliers:', np.count_nonzero(labels == -1))
        print('tot:', len(labels))

32 CLUSTERS, PERCENTILE 75
outliers: 99155
tot: 342323
32 CLUSTERS, PERCENTILE 80
outliers: 78179
tot: 342323
32 CLUSTERS, PERCENTILE 85
outliers: 57101
tot: 342323
32 CLUSTERS, PERCENTILE 90
outliers: 36601
tot: 342323
64 CLUSTERS, PERCENTILE 75
outliers: 98896
tot: 342323
64 CLUSTERS, PERCENTILE 80
outliers: 78178
tot: 342323
64 CLUSTERS, PERCENTILE 85
outliers: 57433
tot: 342323
64 CLUSTERS, PERCENTILE 90
outliers: 37086
tot: 342323
128 CLUSTERS, PERCENTILE 75
outliers: 102131
tot: 342323
128 CLUSTERS, PERCENTILE 80
outliers: 81105
tot: 342323
128 CLUSTERS, PERCENTILE 85
outliers: 59918
tot: 342323
128 CLUSTERS, PERCENTILE 90
outliers: 38783
tot: 342323
256 CLUSTERS, PERCENTILE 75
outliers: 103606
tot: 342323
256 CLUSTERS, PERCENTILE 80
outliers: 82594
tot: 342323
256 CLUSTERS, PERCENTILE 85
outliers: 61425
tot: 342323
256 CLUSTERS, PERCENTILE 90
outliers: 40166
tot: 342323


Count samples percentage for each cluster

In [10]:
unique, counts = np.unique(labels_arr[1], return_counts=True)
dict(zip(unique, np.divide(counts*100, len(labels_arr[1]))))

{-1: 6789825.0,
 0: 197575.0,
 1: 610200.0,
 2: 1507300.0,
 3: 361275.0,
 4: 163525.0,
 5: 491450.0,
 6: 442275.0,
 7: 370200.0,
 8: 370250.0,
 9: 359900.0,
 10: 837550.0,
 11: 621225.0,
 12: 614750.0,
 13: 171825.0,
 14: 172250.0,
 15: 143025.0,
 16: 130425.0,
 17: 270550.0,
 18: 249975.0,
 19: 473825.0,
 20: 314400.0,
 21: 486350.0,
 22: 492700.0,
 23: 492800.0,
 24: 226400.0,
 25: 142575.0,
 26: 495725.0,
 27: 848975.0,
 28: 287200.0,
 29: 273100.0,
 30: 387975.0,
 31: 119750.0,
 32: 1115550.0,
 33: 895850.0,
 34: 505075.0,
 35: 484100.0,
 36: 1440825.0,
 37: 160250.0,
 38: 395900.0,
 39: 184800.0,
 40: 208325.0,
 41: 703475.0,
 42: 451125.0,
 43: 360300.0,
 44: 177375.0,
 45: 150500.0,
 46: 395675.0,
 47: 399900.0,
 48: 348225.0,
 49: 348850.0,
 50: 464800.0,
 51: 175925.0,
 52: 1089025.0,
 53: 441775.0,
 54: 869825.0,
 55: 140100.0,
 56: 363350.0,
 57: 2200.0,
 58: 149325.0,
 59: 510450.0,
 60: 298175.0,
 61: 228075.0,
 62: 621150.0,
 63: 234900.0}