In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import json
from PIL import Image
import pickle
import time


from sklearn.model_selection import cross_val_score, KFold, cross_val_score, StratifiedKFold
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, make_scorer
from sklearn.preprocessing import normalize

from define_dataset import define_dataset
from define_ext_dataset import define_ext_dataset
from ml_models import build_SVM, build_random_forest

In [2]:
# Load the internal dataset and unpack the pieces used by the rest of the notebook.
dataset = define_dataset()
# Slide names of the training and internal test splits.
tr_set, test_set = dataset.tr_set, dataset.test_set
# Indexed by slide name further down to obtain each slide's target.
results_dict = dataset.results_dict
num_positives, num_negatives = dataset.num_positives, dataset.num_negatives

In [3]:
# Load the external dataset and unpack the pieces used by the external test.
ext_dataset = define_ext_dataset()

# Slide names of the external test set and the lookup giving each slide's target.
ext_ts_set, ext_results_dict = ext_dataset.data, ext_dataset.results_dict
num_positive_ext, num_negatives_ext = ext_dataset.num_positives, ext_dataset.num_negatives
batch1_ext, batch2_ext, batch3_ext = ext_dataset.batch1, ext_dataset.batch2, ext_dataset.batch3

In [4]:
# Integer target vectors, one entry per slide, in the same order as the slide lists.
targets_tr = np.array([results_dict[slide_name] for slide_name in tr_set], dtype='int')

targets_ts = np.array([results_dict[slide_name] for slide_name in test_set], dtype='int')

# Import slide tile labels

In [5]:
def import_slide_labels(n_clusters, percentile):
    """Load the saved per-slide cluster label arrays for all three slide sets.

    One ``<slide_name>.npy`` file is read per slide from the directory that
    corresponds to the given ``n_clusters`` / ``percentile`` combination.

    Args:
        n_clusters: number of k-means clusters, used to select the directory.
        percentile: value used in the ``out_perc`` part of the directory name
            (presumably the outlier-removal percentile -- confirm against the
            clustering step).

    Returns:
        Three lists of arrays, in this order: training slides (``tr_set``),
        internal test slides (``test_set``), external test slides
        (``ext_ts_set``). Each list follows the order of its slide set.
    """
    labels_dir = "./clustering/ae/slide_clusters_rem_outliers/kmeans{}_out_perc{}".format(
        n_clusters, percentile)

    def _load_all(slide_names):
        # One array per slide, in the order the names are given.
        return [np.load(os.path.join(labels_dir, name + ".npy")) for name in slide_names]

    slide_labels_tr = _load_all(tr_set)
    slide_labels_ts = _load_all(test_set)
    slide_labels_ext = _load_all(ext_ts_set)

    return slide_labels_tr, slide_labels_ts, slide_labels_ext

# Calculate num tiles per cluster

For each slide, the number of tiles belonging to each cluster is counted and the counts are
L1-normalized so that they sum to 1. Outlier tiles (label=-1) are not counted.

In [6]:
def count_tiles_per_cluster(slide_labels, n_clusters, dataset):
    """Build, for every slide, the normalized histogram of its tiles over the clusters.

    Args:
        slide_labels: sequence whose i-th item holds the cluster label of each
            tile of the i-th slide. Labels outside ``0 .. n_clusters - 1``
            (such as the -1 given to outliers) are not counted.
        n_clusters: number of clusters, i.e. the length of each histogram.
        dataset: the slides to process; it is only iterated to decide how many
            items of ``slide_labels`` are read.

    Returns:
        A list with one 1-D float64 array of length ``n_clusters`` per slide.
        Each array sums to 1, or is all zeros when no tile of the slide was
        counted.
    """
    cluster_ids = np.arange(n_clusters)
    clusters_count = []

    for i, _ in enumerate(dataset):
        labels = np.asarray(slide_labels[i]).ravel()

        if labels.dtype.kind in 'iu':
            # Integer labels: one bincount pass instead of one full scan per cluster.
            in_range = labels[(labels >= 0) & (labels < n_clusters)]
            count = np.bincount(in_range.astype(np.intp), minlength=n_clusters)
        else:
            # Any other dtype: exact equality test against every cluster id.
            count = (labels[:, np.newaxis] == cluster_ids).sum(axis=0)

        # L1 normalization. Dividing by 1 when nothing was counted keeps the
        # histogram at zeros, which is what sklearn's normalize(norm='l1') does.
        total = count.sum()
        norm_count = count.astype(np.float64) / (total if total > 0 else 1)

        clusters_count.append(norm_count)

    return clusters_count

# Train and test models

A random forest and an SVM classifier are trained on the normalized cluster counts and evaluated on the internal test set.

In [7]:
# Grid explored below: every (n_clusters, percentile) pair selects one
# directory of saved slide cluster labels and yields one SVM and one RF model.
n_clusters_arr = [32, 64, 128, 256]
perc_arr = [75, 80, 85, 90]

In [8]:
# Directory where the fitted classifiers are pickled; the external test
# further down reloads them from this same location. It is created once,
# before the grid loop, instead of being checked on every iteration.
models_pth = './classification/class_ae_cluster_rem_out/'
os.makedirs(models_pth, exist_ok=True)

# For every (n_clusters, percentile) pair: build the normalized cluster
# histograms, fit an SVM and a random forest on the training slides, report
# their scores on the internal test slides and save both fitted models.
for n_clusters in n_clusters_arr:

    for percentile in perc_arr:
        print('NUM CLUSTERS:', n_clusters)
        print('PERCENTILE:', percentile)

        slide_labels_tr, slide_labels_ts, _ = import_slide_labels(n_clusters, percentile)
        clusters_count_tr = count_tiles_per_cluster(slide_labels_tr, n_clusters, tr_set)
        clusters_count_ts = count_tiles_per_cluster(slide_labels_ts, n_clusters, test_set)

        # SVM
        print('--- SVM ---')
        print('Training:')
        best_score_svm, best_svm = build_SVM(X=clusters_count_tr, y=targets_tr)
        print('Internal Test:')
        pred = best_svm.predict(clusters_count_ts)
        acc_svm = accuracy_score(y_true=targets_ts, y_pred=pred)
        f1_svm = f1_score(y_true=targets_ts, y_pred=pred)
        f1_weight_svm = f1_score(y_true=targets_ts, y_pred=pred, average="weighted")

        print('acc_score_svm:', acc_svm)
        print('f1_score_svm:', f1_svm)
        print('f1_weight_score_svm:', f1_weight_svm)
        print('\n')

        # Random forest
        print('--- RANDOM FOREST ---')
        print('Training:')
        best_score_rf, best_rf = build_random_forest(X=clusters_count_tr, y=targets_tr)
        print('Internal Test:')
        pred = best_rf.predict(clusters_count_ts)
        acc_rf = accuracy_score(y_true=targets_ts, y_pred=pred)
        f1_rf = f1_score(y_true=targets_ts, y_pred=pred)
        f1_weight_rf = f1_score(y_true=targets_ts, y_pred=pred, average="weighted")

        print('acc_score_rf:', acc_rf)
        print('f1_score_rf:', f1_rf)
        print('f1_weight_score_rf:', f1_weight_rf)

        # Save models. The context managers guarantee that each file is
        # flushed and closed as soon as the model has been written.
        svm_pth = os.path.join(models_pth, 'svm_{}_out{}'.format(n_clusters, percentile))
        rf_pth = os.path.join(models_pth, 'rf_{}_out{}'.format(n_clusters, percentile))

        with open(svm_pth, 'wb') as svm_file:
            pickle.dump(best_svm, svm_file)
        with open(rf_pth, 'wb') as rf_file:
            pickle.dump(best_rf, rf_file)

        print('\n')


NUM CLUSTERS: 32
PERCENTILE: 75
--- SVM ---
Training:
SVM - Best parameters set found:
{'C': 10, 'gamma': 'scale', 'kernel': 'linear'}
SVM - Best accuracy score found:
0.5833333333333334
Internal Test:
acc_score_svm: 0.5
f1_score_svm: 0.0
f1_weight_score_svm: 0.3333333333333333


--- RANDOM FOREST ---
Training:
RF - Best parameters set found:
{'bootstrap': False, 'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'n_estimators': 256}
RF - Best accuraccy score found:
0.861111111111111
Internal Test:
acc_score_rf: 1.0
f1_score_rf: 1.0
f1_weight_score_rf: 1.0


NUM CLUSTERS: 32
PERCENTILE: 80
--- SVM ---
Training:
SVM - Best parameters set found:
{'C': 10, 'gamma': 'scale', 'kernel': 'linear'}
SVM - Best accuracy score found:
0.5833333333333334
Internal Test:
acc_score_svm: 0.6666666666666666
f1_score_svm: 0.5
f1_weight_score_svm: 0.6249999999999999


--- RANDOM FOREST ---
Training:
RF - Best parameters set found:
{'bootstrap': False, 'max_depth': 100, 'max_features': 'auto

# External Test

Import slide targets

In [9]:
# Integer target vector of the external test slides, in the order of ext_ts_set.
targets_ext = np.array([ext_results_dict[slide_name] for slide_name in ext_ts_set], dtype='int')

In [10]:
# Same grid as the one used for training: each (n_clusters, percentile) pair
# is used below to build the file names of the saved SVM and RF models.
n_clusters_arr = [32, 64, 128, 256]
perc_arr = [75, 80, 85, 90]

Make predictions on the clustered tiles of the external test set

In [11]:
print('EXTERNAL TEST\n')

# For every (n_clusters, percentile) pair: build the normalized cluster
# histograms of the external slides, reload the two saved classifiers and
# report their scores together with the raw predictions and targets.
for n_clusters in n_clusters_arr:
    for percentile in perc_arr:
        print('NUM CLUSTERS:', n_clusters)
        print('PERCENTILE:', percentile)

        _, _, slide_labels_ext = import_slide_labels(n_clusters, percentile)
        clusters_count_ext = count_tiles_per_cluster(slide_labels_ext, n_clusters, ext_ts_set)

        svm_pth = './classification/class_ae_cluster_rem_out/svm_{}_out{}'.format(n_clusters, percentile)
        rf_pth = './classification/class_ae_cluster_rem_out/rf_{}_out{}'.format(n_clusters, percentile)

        # NOTE: unpickling can execute code stored in the file, so only model
        # files written by the training step of this notebook should be loaded.
        # The context managers close each file right after the model is read.

        # SVM
        print('SVM')
        with open(svm_pth, 'rb') as svm_file:
            trained_svm = pickle.load(svm_file)
        pred = trained_svm.predict(clusters_count_ext)
        acc_svm = accuracy_score(y_true=targets_ext, y_pred=pred)
        f1_svm = f1_score(y_true=targets_ext, y_pred=pred)
        f1_weight_svm = f1_score(y_true=targets_ext, y_pred=pred, average="weighted")

        print('acc_score_svm:', acc_svm)
        print('f1_score_svm:', f1_svm)
        print('f1_weight_score_svm:', f1_weight_svm)
        print(pred)
        print(targets_ext)

        # Random forest
        print('RANDOM FOREST')
        with open(rf_pth, 'rb') as rf_file:
            trained_rf = pickle.load(rf_file)
        pred = trained_rf.predict(clusters_count_ext)
        acc_rf = accuracy_score(y_true=targets_ext, y_pred=pred)
        f1_rf = f1_score(y_true=targets_ext, y_pred=pred)
        f1_weight_rf = f1_score(y_true=targets_ext, y_pred=pred, average="weighted")

        print('acc_score_rf:', acc_rf)
        print('f1_score_rf:', f1_rf)
        print('f1_weight_score_rf:', f1_weight_rf)
        print(pred)
        print(targets_ext)
        print('\n')

EXTERNAL TEST

NUM CLUSTERS: 32
PERCENTILE: 75
SVM
acc_score_svm: 0.84
f1_score_svm: 0.0
f1_weight_score_svm: 0.7669565217391304
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
RANDOM FOREST
acc_score_rf: 0.84
f1_score_rf: 0.3333333333333333
f1_weight_score_rf: 0.8169696969696968
[0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]
[1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


NUM CLUSTERS: 32
PERCENTILE: 80
SVM
acc_score_svm: 0.84
f1_score_svm: 0.0
f1_weight_score_svm: 0.7669565217391304
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
RANDOM FOREST
acc_score_rf: 0.8
f1_score_rf: 0.28571428571428575
f1_weight_score_rf: 0.7880398671096346
[0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0]
[1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


NUM CLUSTERS: 32
PERCENTILE: 85
SVM
acc_score_svm: 0.84
f1_score_svm: 0.0
f1_weight_score_svm: 0.7669565217391304
[0 0 0 0 0 0 0 