In [6]:
import inspect
import os

import dask.array as da
import dask_image.ndfilters as dif
import dask_ml.cluster
import dask_ml.model_selection as dms
import numpy as np
import zarr
from dask_ml.wrappers import Incremental
from sklearn.cluster import MiniBatchKMeans
from sklearn.linear_model import SGDClassifier

In [2]:
# Directory that is scanned for `.zarr` stores: label stores (names ending in
# "CGT.zarr") and the image stores they belong to.
# NOTE(review): absolute, machine-specific Windows path — adjust before running elsewhere.
training_data_path = "D:\\CapstoneData\\labelled\\data\\real_training"

In [3]:
# Every zarr store found in the training directory.
files = [entry for entry in os.listdir(training_data_path) if entry.endswith('.zarr')]

# Stores whose name ends in "CGT.zarr" are labels; every other store is an image.
labels = sorted(entry for entry in files if entry.endswith("CGT.zarr"))
images = sorted(entry for entry in files if entry not in labels)


def _image_number(image_name):
    """Return the integer found between the first "T" and the following "_" of a name."""
    return int(image_name.split("T")[1].split("_")[0])


# Pair each image with the label stored at position <image number> of the sorted labels.
# NOTE(review): `labels` is sorted as strings, so this positional lookup only picks the
# right label if the label names sort in numeric order starting at 0 (e.g. zero-padded
# numbers) — confirm against the actual file names.
X_y_files = [[image_name, labels[_image_number(image_name)]] for image_name in images]

In [4]:
# Build the feature matrix X (one row per voxel, columns = [laplacian, raw intensity])
# and the label vector y (one entry per voxel) from every (image, label) pair.
X = []
y = []

# Unpack into dedicated names so the `files` list from the listing cell is not clobbered.
for image_file, label_file in X_y_files:
    # Labels are read lazily and flattened, exactly like the image features below, so
    # the label volumes never have to fit in memory together and may differ in shape.
    label_path = os.path.join(training_data_path, label_file)
    y.append(da.from_zarr(label_path, "volume").ravel())

    X_path = os.path.join(training_data_path, image_file)
    new_X = da.from_zarr(X_path, "volume")
    try:
        # Reuse the cached Laplacian-of-Gaussian if an earlier run already stored it.
        laplacian = da.from_zarr(X_path, "laplacian")
    except Exception:
        # The exception raised for a missing component depends on the zarr version, so
        # `Exception` is caught rather than one specific class; unlike a bare `except`
        # this still lets KeyboardInterrupt / SystemExit through.
        laplacian = dif.gaussian_laplace(new_X, 2.5)
        da.to_zarr(laplacian, X_path, "laplacian")
        # Read the stored result back so later steps load it from disk instead of
        # re-running the filter.
        laplacian = da.from_zarr(X_path, "laplacian")

    # One column per feature, one row per voxel.
    arrs = [laplacian, new_X]
    arrs = [arr.ravel() for arr in arrs]
    stacked = da.stack(arrs, -1)
    X.append(stacked)

y = da.concatenate(y)
X = da.concatenate(X)

if X.shape[0] != y.shape[0]:
    raise ValueError(
        f"Feature rows ({X.shape[0]}) and labels ({y.shape[0]}) differ in length; "
        "check that every image is paired with a label volume of the same size."
    )

# Need to rechunk X for the train test split: keep all feature columns in one chunk
X = X.rechunk({1: X.shape[1]})

# Make y match the X chunks. y is 1-D, so only X's row chunks apply; they are passed
# explicitly instead of handing a 2-D chunk spec to a 1-D array.
y = y.rechunk((X.chunks[0],))

In [5]:
# Persist the assembled X and y as two components ("X" and "y") of a single zarr store.
saved_data_path = "D:\\CapstoneData\\labelled\\data\\training.zarr"

da.to_zarr(X, saved_data_path, component="X")
da.to_zarr(y, saved_data_path, component="y")

In [2]:
# Shortcut: once the cells above have written the store, reload X and y from it
# instead of rebuilding them.
saved_data_path = "D:\\CapstoneData\\labelled\\data\\training.zarr"

X, y = (da.from_zarr(saved_data_path, component=name) for name in ("X", "y"))

In [13]:
def test_model(model, args=None, kwargs=None, train_size=0.0001, trials=5, random_state=None) -> list[float]:
    """Fit `model` on several random train/test splits and collect the test scores.

    The data is read from the notebook-level variables `X` and `y`; they must be
    defined before this function is called.

    Parameters
    ----------
    model : callable
        Estimator class (or factory); called as `model(*args, **kwargs)` once per trial.
        The resulting estimator is wrapped in `dask_ml.wrappers.Incremental`.
    args : sequence, optional
        Positional arguments for `model`. Defaults to no arguments.
    kwargs : dict, optional
        Keyword arguments for `model`. Defaults to no arguments.
    train_size : float
        Forwarded to `dask_ml.model_selection.train_test_split`.
    trials : int
        Number of independent split / fit / score rounds.
    random_state : int, optional
        Base seed for the splits. Trial `i` uses `random_state + i`, so trials differ
        from each other but the whole run is repeatable. `None` (the default) leaves
        the splits unseeded.

    Returns
    -------
    list[float]
        One score per trial, as returned by `Incremental.score` on the held-out data.
        The meaning of the score depends on the estimator, so scores of different
        estimator types are not necessarily comparable.
    """
    args = [] if args is None else args
    kwargs = {} if kwargs is None else kwargs

    scores = []
    for i in range(trials):
        print(f"Run {i}")
        split_seed = None if random_state is None else random_state + i
        X_train, X_test, y_train, y_test = dms.train_test_split(
            X, y, train_size=train_size, random_state=split_seed
        )
        estimator = model(*args, **kwargs)

        # Pass `classes` only to estimators whose partial_fit accepts it (Kmeans does
        # not use classes). Checking the signature up front replaces a bare `except:`
        # that hid every fit error and silently refitted without `classes`.
        fit_kwargs = {}
        if "classes" in inspect.signature(estimator.partial_fit).parameters:
            fit_kwargs["classes"] = [0, 1]

        m = Incremental(estimator)
        m.fit(X_train, y_train, **fit_kwargs)
        scores.append(m.score(X_test, y_test))

    return scores

In [15]:
# SGDClassifier comes from the notebook's import cell, keeping all imports in one place.
sgd_scores = test_model(SGDClassifier)
print(sgd_scores)


Run 0
Run 1
Run 2
Run 3
Run 4
[0.915117214888993, 0.9154836755237594, 0.9100087062903494, 0.9146144648068033, 0.8841088126278305]


In [14]:
# MiniBatchKMeans comes from the notebook's import cell, keeping all imports in one place.
# NOTE(review): the values printed for this run are huge negative numbers rather than
# fractions in [0, 1]. scikit-learn documents the k-means `score` as the opposite of the
# k-means objective, so these are presumably not accuracies and cannot be compared with
# the SGD scores — confirm, and score the cluster assignments against y if accuracy is
# what is wanted.
kmeans_scores = test_model(MiniBatchKMeans, kwargs={"n_clusters": 2})
print(kmeans_scores)

Run 0
Run 1
Run 2
Run 3
Run 4
[-942517419769856.0, -692646154600448.0, -699370194337792.0, -960529304649728.0, -710434164310016.0]
