# Classification of DIME embeddings of WSI patches

### By Elsa Jonsson, Embeddings provided by AstraZeneca

In this notebook i will take DIME embeddings of tumour patches of the Camelyon16 dataset and see the patch level binary classification prediction of three different simple classifiers, SVM, LP and MLP. 

In [1]:
import skimage, sklearn, matplot, numpy, scipy, pandas, virtualenv, pickle, os, glob

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_curve, auc, average_precision_score
from sklearn import metrics

## Load & inspect data

In [2]:
folder = 'diverse/simclr/light/no_cutout/train_test/ShuffledTest00_50_ShuffledPartitions3_0/plots/epoch_-1_train_fullSize/'

In [3]:
def load_files(folder, wanted_content):
    content = []
    for root, dirs, files in os.walk(folder):
        for name in files:
            #classes
            if name.endswith('.npy') and name == wanted_content and name != "classes.npy":
                file = numpy.load(os.path.join(root, name))
                for array in file:
                    content.append(array)
            #embeddings
            elif name.endswith('.npy') and name == wanted_content:
                file = numpy.load(os.path.join(root, name))
                for array in file:
                    for label in array:
                        content.append(label)
            #load information about patches
            elif name.endswith('.pkl') and name == wanted_content:
                with open(os.path.join(root, name), 'rb') as f:
                    content = pickle.load(f)
            elif name.endswith('.png') and name == wanted_content:
                content = mpimg.imread(os.path.join(root, name))
    return content

def load_data(folder):
    embeddings = load_files(folder, 'embeddings.npy')
    classes = load_files(folder, 'classes.npy')
    patch_information = load_files(folder, 'paths.pkl')
    image = load_files(folder, 'umap.png')
    return embeddings, classes, patch_information, image

def print_data(i, embeddings, classes, patch_information):    
    print('---------------- EMBEDDING -----------------------')
    print(embeddings[i])
    print('---------------- CLASS -----------------------')
    print(classes[i])
    print('---------------- PATCH INFORMATION -----------------------')
    print(patch_information[i])

def plot_image(img):
    print('---------------- PLOT -----------------------')
    imgplot = plt.imshow(img)
    plt.show()

def show_all(index, embeddings, classes, patch_information, img):
    print_data(index, embeddings, classes, patch_information)
    if img != []:
        plot_image(img)
    print('---------------------------------------')

In [4]:
embeddings, classes, patch_information, image = load_data(folder)

In [5]:
show_all(554, embeddings, classes, patch_information, image)

---------------- EMBEDDING -----------------------
[2.40123253e-02 1.81285143e-02 0.00000000e+00 2.01333649e-02
 2.39383560e-02 1.28672054e-05 1.99968331e-02 1.04077440e-02
 4.84503992e-03 4.16006288e-03 6.03996329e-02 7.30418507e-03
 2.57804971e-02 5.45886569e-02 1.53530845e-02 0.00000000e+00
 6.70516212e-03 2.38452965e-04 6.85714127e-04 4.15593293e-03
 1.68144312e-02 4.11561877e-02 1.99108776e-02 0.00000000e+00
 2.97926106e-02 7.72094866e-03 1.79327410e-02 2.15732604e-02
 1.34194776e-01 6.61028875e-03 3.49758076e-03 2.86811520e-03
 3.84719633e-02 1.52824568e-02 3.89033630e-02 3.15891136e-03
 7.18441280e-03 0.00000000e+00 6.79672584e-02 1.16183339e-02
 3.57914064e-03 5.30151231e-03 7.16186245e-04 0.00000000e+00
 1.04057882e-02 6.71041235e-02 5.93841635e-03 2.18805969e-02
 4.62530268e-04 1.01085067e-01 1.20778463e-03 1.07038552e-02
 7.07511278e-03 7.21452234e-04 2.36038677e-03 2.75023817e-03
 4.76739975e-03 5.98033406e-02 1.50098968e-02 6.81865728e-03
 9.49235074e-03 3.19438204e-02 3.3

## Splitting dataset into training & test set

In [6]:
X_train, X_test, y_train, y_test = train_test_split(embeddings, classes, test_size=0.10, random_state=42)

In [7]:
X, X_t, y, y_t = train_test_split(X_test, y_test, test_size=0.33, random_state=42)

In [8]:
print(len(y_train))
print(len(X_train))
print("------------------------")
print(len(y_test))
print(len(X_test))

print("------------------------ test ")

print(len(y))
print(len(X))
print("------------------------")
print(len(y_t))
print(len(X_t))

1850361
1850361
------------------------
205596
205596
------------------------ test 
137749
137749
------------------------
67847
67847


## Evaluation definition

In [9]:
def evaluate_network(network):
    print('---------------- ACCURACY -----------------------')
    print(network.score(X_t, y_t))
    print('---------------- AVERAGE PRECISION SCORE -----------------------')
    print(average_precision_score(y_t, network.predict_proba(X_t)[:,1]))
    print('---------------- AUC -----------------------')
    fpr, tpr, thresholds = metrics.roc_curve(y_t, network.predict_proba(X_t)[:,1]))
    print(metrics.auc(fpr, tpr))
    print('---------------------------------------')

## SVM - Support Vector Machines

In [10]:
svm = svm.SVC(max_iter=100, probability=True)
svm.fit(X, y)
evaluate_network(svm)



---------------- ACCURACY -----------------------
0.873789555912568
---------------- AVERAGE PRECISION SCORE -----------------------
0.30103274190197893
---------------- ROC -----------------------


NameError: name 'fpr' is not defined

## LR - Logistic Regression

In [None]:
lr = LogisticRegression(max_iter=100)
lr.fit(X, y)
evaluate_network(lr)

## MLP - Multi-Layer Perceptron

In [None]:
mlp = MLPClassifier(max_iter=10)
mlp.fit(X,y)
evaluate_network(mlp)