In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
import json
from PIL import Image
import pickle
import time


from sklearn.model_selection import cross_val_score, KFold, cross_val_score, StratifiedKFold
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, make_scorer
from sklearn.preprocessing import normalize

from define_dataset import define_dataset
from define_ext_dataset import define_ext_dataset
from ml_models import build_SVM, build_random_forest

# Import slide targets

In [2]:
# Slide-name arrays for the train / validation / test splits. Each entry is later
# concatenated with ".npy" to locate that slide's cluster-label file.
comb_tr_set = np.load('./data/comb_tr_set.npy')
comb_val_set = np.load('./data/comb_val_set.npy')
comb_ts_set = np.load('./data/comb_ts_set.npy')
# Targets for each split, passed as `y` / `y_true` to the classifiers and metrics below.
# NOTE(review): assumed to be ordered like the matching *_set array -- confirm.
targets_comb_tr = np.load('./data/comb_tr_targets.npy')
targets_comb_val = np.load('./data/comb_val_targets.npy')
targets_comb_ts = np.load('./data/comb_ts_targets.npy')

# Import slide tile labels

In [3]:
def import_slide_labels(n_clusters, percentile):
    """Load the per-slide cluster-label arrays for the three dataset splits.

    Files are read from the directory whose name is built from ``n_clusters``
    and ``percentile``; each slide contributes one ``<slide_name>.npy`` file.
    The slide names come from the module-level ``comb_tr_set``,
    ``comb_val_set`` and ``comb_ts_set`` arrays.

    Parameters
    ----------
    n_clusters : value substituted into the ``kmeans{}`` part of the path.
    percentile : value substituted into the ``out_perc{}`` part of the path.

    Returns
    -------
    tuple of three lists ``(train, validation, test)``; each list holds one
    loaded array per slide, in the order of the corresponding split.
    """
    cluster_dir = "./clustering/comb_ae/slide_clusters_rem_outliers/kmeans{}_out_perc{}".format(n_clusters, percentile)

    def _load_split(slide_names):
        # One array per slide, kept in the order of the split.
        return [np.load(os.path.join(cluster_dir, name + ".npy")) for name in slide_names]

    # Splits are read in train, validation, test order.
    train_labels = _load_split(comb_tr_set)
    val_labels = _load_split(comb_val_set)
    test_labels = _load_split(comb_ts_set)

    return train_labels, val_labels, test_labels

# Calculate num tiles per cluster

For each slide, the number of tiles belonging to each cluster is counted, and the counts are then L1-normalized so that they sum to 1.

In [4]:
def count_tiles_per_cluster(slide_labels, n_clusters, dataset):
    """Build one L1-normalized cluster histogram per slide.

    For every slide the number of tiles assigned to each cluster
    ``0 .. n_clusters - 1`` is counted and divided by the total count, so each
    histogram sums to 1 (or is all zeros when no tile falls in that range).

    Parameters
    ----------
    slide_labels : sequence of array-likes
        ``slide_labels[i]`` holds the cluster label of every tile of slide ``i``.
    n_clusters : int
        Number of clusters, i.e. the length of each returned histogram.
    dataset : sequence
        Only its length is used: one histogram is produced per entry, read
        from ``slide_labels`` at the same index.

    Returns
    -------
    list of numpy.ndarray
        One float64 vector of shape ``(n_clusters,)`` per entry of ``dataset``.
    """
    cluster_ids = np.arange(n_clusters)
    clusters_count = []

    for i, _ in enumerate(dataset):
        labels = np.asarray(slide_labels[i]).ravel()
        # Keep only labels equal to a valid cluster id; anything else (negative,
        # too large, non-integer) is ignored rather than counted.
        in_range = labels[np.isin(labels, cluster_ids)]
        # One vectorized count instead of one full scan of the labels per cluster.
        count = np.bincount(in_range.astype(np.intp, copy=False), minlength=n_clusters)

        # L1 normalization; a slide with no countable tile keeps an all-zero row.
        total = count.sum()
        if total > 0:
            norm_count = count / total
        else:
            norm_count = np.zeros(n_clusters, dtype=np.float64)

        clusters_count.append(norm_count)

    return clusters_count

# Train and validate models

Random forest and SVM are used

In [5]:
# Grid swept by the training loop below: every (n_clusters, percentile) pair is
# substituted into the cluster-label directory name by import_slide_labels.
n_clusters_arr = [32, 64, 128, 256]  # cluster counts to try
perc_arr = [75, 80, 85, 90]  # percentile values used in the outlier-removal directory names

In [6]:
# Directory that receives one pickled SVM and one pickled random forest per grid
# point. It does not depend on the loop variables, so it is created once up front.
models_pth = './classification/class_comb_ae_cluster_rem_out/'
os.makedirs(models_pth, exist_ok=True)

for n_clusters in n_clusters_arr:

    for percentile in perc_arr:
        print('NUM CLUSTERS:', n_clusters)
        print('PERCENTILE:', percentile)

        # The test split is deliberately ignored here; it is only used in the test cell.
        slide_labels_comb_tr, slide_labels_comb_val, _ = import_slide_labels(n_clusters, percentile)

        # Per-slide normalized cluster histograms are the feature vectors.
        clusters_count_tr = count_tiles_per_cluster(slide_labels_comb_tr, n_clusters, comb_tr_set)
        clusters_count_val = count_tiles_per_cluster(slide_labels_comb_val, n_clusters, comb_val_set)

        # SVM
        print('--- SVM ---')
        print('Training:')
        # NOTE(review): build_SVM presumably runs its own parameter search (it prints
        # the best parameters and score) -- see ml_models to confirm.
        best_score_svm, best_svm = build_SVM(X=clusters_count_tr, y=targets_comb_tr)
        print('Validation:')
        pred = best_svm.predict(clusters_count_val)
        acc_svm = accuracy_score(y_true=targets_comb_val, y_pred=pred)
        f1_svm = f1_score(y_true=targets_comb_val, y_pred=pred)
        f1_weight_svm = f1_score(y_true=targets_comb_val, y_pred=pred, average="weighted")

        print('acc_score_svm:', acc_svm)
        print('f1_score_svm:', f1_svm)
        print('f1_weight_score_svm:', f1_weight_svm)
        print('\n')

        # Random forest
        print('--- RANDOM FOREST ---')
        print('Training:')
        best_score_rf, best_rf = build_random_forest(X=clusters_count_tr, y=targets_comb_tr)
        print('Validationt:')
        pred = best_rf.predict(clusters_count_val)
        acc_rf = accuracy_score(y_true=targets_comb_val, y_pred=pred)
        f1_rf = f1_score(y_true=targets_comb_val, y_pred=pred)
        f1_weight_rf = f1_score(y_true=targets_comb_val, y_pred=pred, average="weighted")

        print('acc_score_rf:', acc_rf)
        print('f1_score_rf:', f1_rf)
        print('f1_weight_score_rf:', f1_weight_rf)

        # Save models
        svm_pth = os.path.join(models_pth, 'svm_{}_out{}'.format(n_clusters, percentile))
        rf_pth = os.path.join(models_pth, 'rf_{}_out{}'.format(n_clusters, percentile))

        # Context managers guarantee each file is flushed and closed before the
        # next grid point (and before the test cell reads the models back).
        with open(svm_pth, 'wb') as svm_file:
            pickle.dump(best_svm, svm_file)
        with open(rf_pth, 'wb') as rf_file:
            pickle.dump(best_rf, rf_file)

        print('\n')


NUM CLUSTERS: 32
PERCENTILE: 75
--- SVM ---
Training:
SVM - Best parameters set found:
{'C': 10, 'degree': 2, 'gamma': 'scale', 'kernel': 'poly'}
SVM - Best accuracy score found:
0.8402777777777777
Validation:
acc_score_svm: 0.6666666666666666
f1_score_svm: 0.6
f1_weight_score_svm: 0.6761904761904761


--- RANDOM FOREST ---
Training:
RF - Best parameters set found:
{'bootstrap': True, 'max_depth': 100, 'max_features': 'auto', 'min_samples_leaf': 1, 'n_estimators': 64}
RF - Best accuraccy score found:
0.7916666666666666
Validationt:
acc_score_rf: 0.6666666666666666
f1_score_rf: 0.5
f1_weight_score_rf: 0.6666666666666666


NUM CLUSTERS: 32
PERCENTILE: 80
--- SVM ---
Training:
SVM - Best parameters set found:
{'C': 10, 'degree': 2, 'gamma': 'scale', 'kernel': 'poly'}
SVM - Best accuracy score found:
0.8194444444444443
Validation:
acc_score_svm: 0.6666666666666666
f1_score_svm: 0.6
f1_weight_score_svm: 0.6761904761904761


--- RANDOM FOREST ---
Training:
RF - Best parameters set found:
{'b

# Test

In [7]:
# Same grid as the training cell (In [5]), re-declared so the test cell below can
# be run on its own; each pair selects the pickled models to load.
n_clusters_arr = [32, 64, 128, 256]  # cluster counts to evaluate
perc_arr = [75, 80, 85, 90]  # percentile values used in the outlier-removal directory names

Make predictions on the test set from its clustered-tile features, using the models saved during training.

In [8]:
print('TEST\n')

# Directory the training cell wrote its pickled models to; it does not depend on
# the loop variables, so it is set once.
models_pth = './classification/class_comb_ae_cluster_rem_out/'

for n_clusters in n_clusters_arr:
    for percentile in perc_arr:
        print('NUM CLUSTERS:', n_clusters)
        print('PERCENTILE:', percentile)

        # Only the test split is needed here.
        _, _, slide_labels_comb_ts = import_slide_labels(n_clusters, percentile)

        clusters_count_ts = count_tiles_per_cluster(slide_labels_comb_ts, n_clusters, comb_ts_set)

        svm_pth = os.path.join(models_pth, 'svm_{}_out{}'.format(n_clusters, percentile))
        rf_pth = os.path.join(models_pth, 'rf_{}_out{}'.format(n_clusters, percentile))

        # NOTE: pickle.load can execute arbitrary code. These files are expected to be
        # the ones written by the training cell; do not point this at untrusted files.

        # SVM
        print('SVM')
        with open(svm_pth, 'rb') as svm_file:
            trained_svm = pickle.load(svm_file)
        pred = trained_svm.predict(clusters_count_ts)
        acc_svm = accuracy_score(y_true=targets_comb_ts, y_pred=pred)
        f1_svm = f1_score(y_true=targets_comb_ts, y_pred=pred)
        f1_weight_svm = f1_score(y_true=targets_comb_ts, y_pred=pred, average="weighted")

        print('acc_score_svm:', acc_svm)
        print('f1_score_svm:', f1_svm)
        print('f1_weight_score_svm:', f1_weight_svm)
        # Predictions followed by the ground truth, for eyeballing individual slides.
        print(pred)
        print(targets_comb_ts)

        # Random forest
        print('RANDOM FOREST')
        with open(rf_pth, 'rb') as rf_file:
            trained_rf = pickle.load(rf_file)
        pred = trained_rf.predict(clusters_count_ts)
        acc_rf = accuracy_score(y_true=targets_comb_ts, y_pred=pred)
        f1_rf = f1_score(y_true=targets_comb_ts, y_pred=pred)
        f1_weight_rf = f1_score(y_true=targets_comb_ts, y_pred=pred, average="weighted")

        print('acc_score_rf:', acc_rf)
        print('f1_score_rf:', f1_rf)
        print('f1_weight_score_rf:', f1_weight_rf)
        print(pred)
        print(targets_comb_ts)
        print('\n')

TEST

NUM CLUSTERS: 32
PERCENTILE: 75
SVM
acc_score_svm: 0.6666666666666666
f1_score_svm: 0.6
f1_weight_score_svm: 0.6666666666666666
[0 0 1 0 0 1 0 0 1 1 1 0]
[0 0 0 0 1 1 1 0 1 0 1 0]
RANDOM FOREST
acc_score_rf: 0.75
f1_score_rf: 0.7272727272727272
f1_weight_score_rf: 0.7517482517482517
[1 0 0 0 1 1 0 0 1 1 1 0]
[0 0 0 0 1 1 1 0 1 0 1 0]


NUM CLUSTERS: 32
PERCENTILE: 80
SVM
acc_score_svm: 0.6666666666666666
f1_score_svm: 0.6
f1_weight_score_svm: 0.6666666666666666
[0 0 1 0 0 1 0 0 1 1 1 0]
[0 0 0 0 1 1 1 0 1 0 1 0]
RANDOM FOREST
acc_score_rf: 0.8333333333333334
f1_score_rf: 0.8000000000000002
f1_weight_score_rf: 0.8333333333333334
[1 0 0 0 1 1 0 0 1 0 1 0]
[0 0 0 0 1 1 1 0 1 0 1 0]


NUM CLUSTERS: 32
PERCENTILE: 85
SVM
acc_score_svm: 0.6666666666666666
f1_score_svm: 0.6
f1_weight_score_svm: 0.6666666666666666
[0 0 1 0 0 1 0 0 1 1 1 0]
[0 0 0 0 1 1 1 0 1 0 1 0]
RANDOM FOREST
acc_score_rf: 0.8333333333333334
f1_score_rf: 0.8000000000000002
f1_weight_score_rf: 0.8333333333333334
[1 0 0