In [1]:
import pickle
import sys
from collections import defaultdict
from pathlib import Path

import cv2
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import parc
import scipy.cluster.hierarchy as sch
import seaborn as sns
from matplotlib.colors import LogNorm
from mpl_toolkits.axes_grid1.inset_locator import mark_inset, zoomed_inset_axes
from skimage import measure, exposure
import skimage.io
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from typing import List
import dask.dataframe as dd
import dask_ml.cluster


In [2]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell is executed, so edits to local .py files take effect without a restart.
%load_ext autoreload
%autoreload 2

In [4]:
# Import path
# Make the parent of the working directory and its "src" sub-directory
# importable, appending each one only if it is not already on sys.path.
parent_dir = Path.cwd().parents[0]
for import_dir in (parent_dir, parent_dir / "src"):
    module_path = str(import_dir)
    if module_path not in sys.path:
        sys.path.append(module_path)

In [5]:
import utils as utils
from config import *

In [6]:
# Parameters
# Threshold applied to the min-max-scaled intensities: a pixel is treated as
# "dark" (and filtered out below) when every channel is <= this value.
min_intensity = 0.4
# Seed for random number generation — presumably meant for the stochastic
# clustering step; verify that it is actually passed on where needed.
random_seed = 1

# Read data

In [7]:
# Load the per-pixel intensity table from the metadata directory.
pixel_csv_path = data_meta / 'pixel_intensity_3D.csv'
df_pixels = pd.read_csv(pixel_csv_path)

In [8]:
# Preview the first rows of the loaded table.
df_pixels.head()

Unnamed: 0,p-EGFR,Non-p-B-catenin,APC,DKK1,Cyclin E,EMMPRIN,Wnt1,p-AKT,p-B-catenin,RNF 43,...,EGFR,Cyclin D1,B-tubin,H3k27me3-488,H3k9Ac-555,H3k4me3-647,X,Y,Id,Z
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0,0,0,24,2,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0,0,0,25,2,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3733.0,...,0.0,0.0,0.0,0,0,0,0,26,2,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0,0,0,27,2,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0,0,0,28,2,1


In [9]:
# Keep only the marker-intensity columns. The last columns of the table
# (X, Y, Id, Z — presumably pixel coordinates / identifiers) are not channel
# intensities and must not be scaled or clustered.
# The positional slice is kept (it avoids copying a very large frame), but the
# layout is checked first so a reordered CSV fails loudly instead of silently
# feeding the wrong columns into the clustering.
non_marker_cols = ["X", "Y", "Id", "Z"]
assert list(df_pixels.columns[-len(non_marker_cols):]) == non_marker_cols, (
    f"expected trailing columns {non_marker_cols}, "
    f"got {list(df_pixels.columns[-len(non_marker_cols):])}"
)
pixels = df_pixels.iloc[:, :-len(non_marker_cols)]
# Optional channel exclusion, kept for reference:
# pixels = pixels.drop(columns=['Concanavalin A', 'Phalloidin', 'Hoeschst', 'WGA', 'B-actin', 'PCNA'])

pixels.head()

Unnamed: 0,p-EGFR,Non-p-B-catenin,APC,DKK1,Cyclin E,EMMPRIN,Wnt1,p-AKT,p-B-catenin,RNF 43,...,p-mTOR,mTOR,DKK2,AXIN1,EGFR,Cyclin D1,B-tubin,H3k27me3-488,H3k9Ac-555,H3k4me3-647
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3733.0,...,0.0,1914.0,0.0,0.0,0.0,0.0,0.0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0


In [10]:
# Scale data
# Learn the per-column min/max first, then apply the rescaling, and wrap the
# resulting array back into a DataFrame that carries the original column names.
scaler = MinMaxScaler().fit(pixels)
x_scaled = scaler.transform(pixels)
pixels_scaled = pd.DataFrame(data=x_scaled, columns=pixels.columns)

In [11]:
# A pixel counts as "dark" when every scaled channel is at or below the
# threshold; all remaining pixels are kept as "bright".
pixel_dark = (pixels_scaled <= min_intensity).all(axis=1)
pixels_bright = pixels_scaled.loc[~pixel_dark]

# Show how many pixels fall on each side of the filter, then a sample of the
# rows that survive it.
display(pixel_dark.value_counts())
display(pixels_bright.head())

False    21448818
True      7906239
dtype: int64

Unnamed: 0,p-EGFR,Non-p-B-catenin,APC,DKK1,Cyclin E,EMMPRIN,Wnt1,p-AKT,p-B-catenin,RNF 43,...,p-mTOR,mTOR,DKK2,AXIN1,EGFR,Cyclin D1,B-tubin,H3k27me3-488,H3k9Ac-555,H3k4me3-647
67,0.0,0.0,0.262913,0.039445,0.0,0.0,0.0,0.033051,0.0,0.0,...,0.0,0.453925,0.0,0.0,0.0,0.0,0.0,0.080232,0.0,0.054536
92,0.046082,0.152102,0.0,0.0,0.0,0.0,0.0,0.094865,0.0,0.0,...,0.0,0.0,0.0,0.0,0.529625,0.0,0.0,0.176516,0.0,0.0
98,0.0,0.306325,0.497475,0.122911,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.084459,0.0,0.056184
100,0.0,0.0,0.714366,0.0,0.0,0.060288,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.059953,0.0,0.042969
101,0.0,0.0,0.417395,0.062211,0.0,0.0,0.0,0.034012,0.0,0.063279,...,0.0,0.022461,0.106279,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Hand the bright-pixel table to Dask, partitioned into fixed-size row chunks.
rows_per_partition = 100_000
X = dd.from_pandas(pixels_bright, chunksize=rows_per_partition)

In [13]:
# Ask Dask to persist the collection (per the Dask docs this computes the
# partitions and keeps them in memory), then display its
# structure (columns, dtypes, partition boundaries).
X = X.persist()
X

Unnamed: 0_level_0,p-EGFR,Non-p-B-catenin,APC,DKK1,Cyclin E,EMMPRIN,Wnt1,p-AKT,p-B-catenin,RNF 43,AKT,p-mTOR,mTOR,DKK2,AXIN1,EGFR,Cyclin D1,B-tubin,H3k27me3-488,H3k9Ac-555,H3k4me3-647
npartitions=215,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
67,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64
189153,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29278346,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29355056,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [14]:
# Cluster the bright pixels into 60 groups with Dask-ML's k-means.
# random_state seeds the randomised centroid initialisation so the clustering
# can be reproduced across runs; random_seed comes from the parameters cell.
km = dask_ml.cluster.KMeans(
    n_clusters=60,
    oversampling_factor=10,
    random_state=random_seed,
)
km.fit(X)

KMeans(n_clusters=60, oversampling_factor=10)

In [15]:
# Cluster assignment for every row of X, as exposed by the fitted estimator.
labels = km.labels_

In [16]:
# Materialise the label array in memory.
# It is read from km.labels_ rather than rebinding `labels` from itself, so
# this cell can be re-run safely: calling .compute() on the already-computed
# result would fail because that object no longer has a .compute() method.
labels = km.labels_.compute()

In [17]:
# Sanity check: number of cluster labels produced.
len(labels)

21448818

In [18]:
# Sanity check: number of clustered rows; should equal the label count above.
len(X)

21448818

In [19]:
# The labels and the pixel rows are saved side by side and are presumably
# matched by position by downstream readers, so refuse to write a file in
# which the two do not have the same length.
assert len(labels) == len(pixels_bright), (
    f"label/pixel count mismatch: {len(labels)} vs {len(pixels_bright)}"
)

# Save the cluster labels together with the scaled bright-pixel table.
# The filename contains no placeholders, so it is a plain string literal.
with open(data_meta / "kmeans_labels_3D.pickle", "wb") as f:
    pickle.dump([labels, pixels_bright], f)