In [2]:
import pickle
import sys
from collections import defaultdict
from pathlib import Path

import cv2
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.cluster.hierarchy as sch
import seaborn as sns
from matplotlib.colors import LogNorm
from mpl_toolkits.axes_grid1.inset_locator import mark_inset, zoomed_inset_axes
from skimage import measure, exposure
import skimage.io
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from typing import List
import dask.dataframe as dd
import dask_ml.cluster


In [3]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
# Make the parent directory of the working directory, and its "src"
# sub-folder, importable from this notebook.
project_root = Path.cwd().parents[0]
for candidate in (project_root, project_root / "src"):
    module_path = str(candidate)
    # Already registered: nothing to do, so re-running the cell adds no duplicates
    if module_path in sys.path:
        continue
    sys.path.append(module_path)

In [5]:
# Project-local modules (presumably resolved through the sys.path entries
# registered in the previous cell — confirm).
import utils as utils
# NOTE(review): the star import hides which names come from `config`.
# `data_meta`, used by later cells, is not defined anywhere else in the visible
# cells, so it presumably originates here — confirm and consider importing it
# explicitly.
from config import *

In [6]:
# Load the table holding the images' information from the metadata folder
csv_file = data_meta / "info_combined.csv"
df = pd.read_csv(filepath_or_buffer=csv_file)

In [7]:
# Parameters
# Intensity threshold: a pixel whose channels are all <= this value is treated
# as "dark" and excluded before clustering.
min_intensity = 0.3
# Seed meant for stochastic steps.
# NOTE(review): confirm it is actually passed to the clustering call.
random_seed = 1

# Read data (section marker for the loading cells that follow)

In [8]:
# Load the per-pixel intensity table (one row per pixel, one column per channel,
# followed by the Location / X / Y / Id columns shown in the preview below).
df_pixels = pd.read_csv(data_meta / 'pixel_intensity.csv')

In [9]:
# Preview the first rows to check the column layout
df_pixels.head()

Unnamed: 0,Hoeschst,p-EGFR,Concanavalin A,PCNA,B-actin,Phalloidin,WGA,APC,DKK1,Non-p-B-catenin,...,EGFR,B-tubin,Cyclin D1,H3k27me3-488,H3k4me3-647,H3k9Ac-555,Location,X,Y,Id
0,0,0.041269,0.0,0.109774,0.0,0.0,0.0,0.003882,0.0,0.0,...,0.148699,0.110687,0.051846,0.0,0.0,0.10557,10,18,1086,6
1,0,0.0,0.0,0.043609,0.0,0.0,0.0,0.007151,0.010676,0.0,...,0.126394,0.0,0.003928,0.002201,0.047569,0.123705,10,19,1086,6
2,0,0.0,0.0,0.0,0.00822,0.0,0.0,0.110427,0.0,0.0,...,0.223048,0.0,0.071485,0.0,0.072138,0.061528,10,19,1087,6
3,0,0.016371,0.0,0.019549,0.011009,0.0,0.0,0.121868,0.0,0.0,...,0.275093,0.0,0.120974,0.0,0.029273,0.029145,10,19,1088,6
4,0,0.0,0.0,0.192481,0.0,0.0,0.0,0.018796,0.01637,0.0,...,0.252788,0.168893,0.0,0.0,0.046001,0.135363,10,20,1085,6


In [10]:
# Keep only the channels used for clustering:
#  - the four trailing columns (Location, X, Y, Id in the preview above) are
#    cut off by position;
#  - the channels listed below are then removed by name.
excluded_channels = ['Concanavalin A', 'Phalloidin', 'Hoeschst', 'WGA', 'B-actin', 'PCNA']
pixels = df_pixels.iloc[:, :-4].drop(columns=excluded_channels)

pixels.head()

Unnamed: 0,p-EGFR,APC,DKK1,Non-p-B-catenin,Cyclin E,EMMPRIN,Wnt1,RNF 43,p-AKT,p-B-catenin,...,mTOR,p-mTOR,AXIN1,DKK2,EGFR,B-tubin,Cyclin D1,H3k27me3-488,H3k4me3-647,H3k9Ac-555
0,0.041269,0.003882,0.0,0.0,0.011399,0.0,0.0,0.037787,0.035695,0.079777,...,0.0,0.0,0.048759,0.0,0.148699,0.110687,0.051846,0.0,0.0,0.10557
1,0.0,0.007151,0.010676,0.0,0.020605,0.0,0.0,0.0,0.0,0.04731,...,0.002365,0.0,0.143666,0.007874,0.126394,0.0,0.003928,0.002201,0.047569,0.123705
2,0.0,0.110427,0.0,0.0,0.0,0.0,0.0,0.048583,0.0,0.1141,...,0.081595,0.102069,0.0,0.124672,0.223048,0.0,0.071485,0.0,0.072138,0.061528
3,0.016371,0.121868,0.0,0.0,0.011399,0.021042,0.0,0.145749,0.0,0.020408,...,0.06031,0.213967,0.0,0.040682,0.275093,0.0,0.120974,0.0,0.029273,0.029145
4,0.0,0.018796,0.01637,0.0,0.0,0.0,0.0,0.107962,0.0,0.000928,...,0.043754,0.043096,0.106661,0.041995,0.252788,0.168893,0.0,0.0,0.046001,0.135363


In [11]:
# Scale data
# NOTE: min-max scaling is currently disabled; the commented lines below are
# kept for reference, so the intensities are used exactly as loaded.
# scaler = MinMaxScaler()
# x_scaled = scaler.fit_transform(pixels)
# pixels_scaled = pd.DataFrame(x_scaled, columns=pixels.columns)
# `pixels_scaled` is just another name for `pixels` here (no copy is made).
pixels_scaled = pixels

In [12]:
# Flag rows in which every channel is at or below the threshold ("dark" pixels)
pixel_dark = (pixels_scaled <= min_intensity).all(axis="columns")
display(pixel_dark.value_counts())

# Keep only rows that have at least one channel above the threshold
pixels_bright = pixels_scaled.loc[~pixel_dark]
display(pixels_bright.head())

False    52981896
True     22842961
dtype: int64

Unnamed: 0,p-EGFR,APC,DKK1,Non-p-B-catenin,Cyclin E,EMMPRIN,Wnt1,RNF 43,p-AKT,p-B-catenin,...,mTOR,p-mTOR,AXIN1,DKK2,EGFR,B-tubin,Cyclin D1,H3k27me3-488,H3k4me3-647,H3k9Ac-555
7,0.0,0.662358,0.012811,0.0,0.0,0.0,0.0,0.059379,0.0,0.139147,...,0.086326,0.008317,0.032216,0.005249,0.042751,0.081107,0.034564,0.0,0.010455,0.026554
8,0.068213,0.398191,0.027758,0.0,0.0,0.0,0.0,0.021592,0.020319,0.080705,...,0.0,0.046876,0.067915,0.139108,0.223048,0.043893,0.000786,0.0,0.0,0.0
13,0.011255,0.358862,0.061922,0.001361,0.0,0.006554,0.0,0.0,0.019769,0.0,...,0.0,0.06351,0.0,0.167979,0.198885,0.056298,0.0,0.000367,0.03241,0.044689
24,0.000341,0.02564,0.058363,0.0,0.079351,0.0,0.064846,0.026991,0.0,0.0,...,0.0,0.0,0.0,0.14042,0.042751,0.064885,0.0,0.0,0.039728,0.020725
26,0.0,0.0,0.017082,0.0,0.022359,0.0,0.070697,0.067476,0.0,0.207792,...,0.020103,0.149701,0.030475,0.015748,0.092937,0.0,0.382561,0.0,0.0,0.027202


In [13]:
# Wrap the bright pixels in a dask DataFrame, split into chunks of 100k rows
X = dd.from_pandas(pixels_bright, chunksize=100_000)

In [14]:
# Persist the dask collection (see dask's `persist`) and rebind X to the result.
X = X.persist()
# Bare expression: displays the collection's structure (columns, dtypes, partitions)
X

Unnamed: 0_level_0,p-EGFR,APC,DKK1,Non-p-B-catenin,Cyclin E,EMMPRIN,Wnt1,RNF 43,p-AKT,p-B-catenin,AKT,mTOR,p-mTOR,AXIN1,DKK2,EGFR,B-tubin,Cyclin D1,H3k27me3-488,H3k4me3-647,H3k9Ac-555
npartitions=530,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64
125608,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75670971,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75824855,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [15]:
# Cluster the bright pixels into 60 groups.
# The notebook-level `random_seed` is passed as `random_state` so the
# estimator's random initialisation is seeded instead of differing on every
# run; previously the seed was defined but never used.
km = dask_ml.cluster.KMeans(
    n_clusters=60,
    oversampling_factor=10,
    random_state=random_seed,
)
km.fit(X)

KMeans(n_clusters=60, oversampling_factor=10)

In [16]:
# Cluster assignments exposed by the fitted estimator; still needs `.compute()`
# (done in the following cell) before it can be used as concrete values.
labels = km.labels_

In [17]:
# Materialise the cluster labels.
# Read from `km.labels_` rather than re-assigning `labels` from itself: the
# self-referential form only works once per kernel state, because a second run
# would call `.compute()` on the already computed result.
labels = km.labels_.compute()

In [18]:
# Number of cluster labels produced (expected to equal the number of rows in X)
len(labels)

52981896

In [19]:
# The saved artefact pairs `labels` with the clustered rows, so there must be
# exactly one label per row. Assert it instead of comparing two printed numbers.
assert len(labels) == len(X), "label count does not match number of clustered rows"
len(X)

52981896

In [20]:
# Save the clustering result next to the other metadata files.
# Payload layout: a two-item list, [labels, pixels_bright], in that order, so
# readers must unpack it the same way.
# The filename has no placeholders, so a plain string literal replaces the
# former f-string (same resulting path).
with open(data_meta / "kmeans_labels_new.pickle", "wb") as f:
    pickle.dump([labels, pixels_bright], f)