# Практика по кластеризации

In [None]:
import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
import warnings

from IPython.display import display
from functools import lru_cache
from ipywidgets import interact, fixed, IntSlider, FloatSlider
from sklearn.base import TransformerMixin
from sklearn.cluster import (MeanShift, AgglomerativeClustering, DBSCAN,
                             MiniBatchKMeans, KMeans, 
                             SpectralClustering)
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
# Move the working directory two levels up (presumably to the project root,
# so the project-local imports in the next cell resolve — confirm layout).
# NOTE: re-running this cell moves up two more levels each time.
os.chdir(os.path.join(os.pardir, os.pardir))

In [None]:
from definitions import DATA_DIR
from src.utils import timeit

In [None]:
%matplotlib inline
warnings.filterwarnings('ignore')

SEED = 5
np.random.seed(SEED)

### Используемые данные.
Проточная цитометрия — метод исследования дисперсных сред в режиме поштучного анализа элементов дисперсной фазы по сигналам светорассеяния и флуоресценции. Название метода связано с основным приложением, а именно с исследованием одиночных биологических клеток в потоке.
<img src="../../misc/cytometry.png" width="680"/>

In [None]:
# Load one DataFrame per file found in the flow-cytometry data directory.
# os.listdir() returns entries in arbitrary order, so the file names are
# sorted to make positional indexing (dfs[0], dfs[5], ...) reproducible.
flowcytometry_dir = os.path.join(DATA_DIR, 'flowcytometry')
dfs = [pd.read_csv(os.path.join(flowcytometry_dir, file_name))
       for file_name in sorted(os.listdir(flowcytometry_dir))]
# The first DataFrame is the one explored in the cells below.
df = dfs[0]

In [None]:
# Display summary statistics for `df`.
df.describe()

In [None]:
# For every loaded DataFrame, print how many columns contain missing values.
# The loop variable is deliberately not called `df`: reusing that name would
# rebind the module-level `df` to the last DataFrame in `dfs`.
for patient_number, patient_df in enumerate(dfs, start=1):
    print(f'Patient {patient_number}:', patient_df.isnull().any().sum())

In [None]:
# Annotated heatmap of the pairwise column correlations of `df`.
fig, ax = plt.subplots(figsize=(10, 9))
sns.heatmap(
    df.corr(),
    annot=True,
    cmap='coolwarm',
    linewidths=2,
    ax=ax,
)

In [None]:
# Scatter plot of the 'FSC-A-' column against the 'SSC-A-' column of `df`.
fig, ax = plt.subplots(figsize=(9, 9))
# x/y are passed by keyword: recent seaborn releases reject them as
# positional arguments. `ax` is given explicitly instead of relying on the
# implicit "current axes".
sns.scatterplot(x=df['FSC-A-'], y=df['SSC-A-'], ax=ax)
fig.canvas.draw()

### Кластеризация

In [None]:
# Registry of the available clustering methods. Each entry holds the
# estimator class ('method') and the widgets ('params_range') that supply
# its constructor keyword arguments.
clustering = dict(
    kmeans=dict(
        method=KMeans,
        params_range=dict(
            n_clusters=IntSlider(min=2, max=10),
            random_state=fixed(SEED),
        ),
    ),
    dbscan=dict(
        method=DBSCAN,
        params_range=dict(
            eps=FloatSlider(min=0.05, max=1.5, step=0.05),
            min_samples=IntSlider(min=1, max=25),
        ),
    ),
)

# Method used in the cells below, together with its widgets.
method_name = 'dbscan'
params_range = clustering[method_name]['params_range']
# Two-column feature frame taken from the sixth loaded DataFrame.
X = dfs[5][['FSC-A-', 'SSC-A-']]

In [None]:
class InteractiveClusterer:
    """Fit a clustering estimator on a feature frame and plot the labels.

    The estimator class is instantiated with whatever keyword arguments are
    passed to :meth:`fit` / :meth:`plot`, which makes the object suitable as
    a callback for widget-driven parameter exploration.

    Args:
        method: Estimator class. It is called as ``method(**kwargs)``; the
            resulting object must provide ``fit(X)`` and expose ``labels_``
            afterwards.
        params_range: Mapping of parameter names to their widgets/ranges.
            It is only stored on the instance.
        X: Feature frame. :meth:`plot` reads its ``'FSC-A-'`` and
            ``'SSC-A-'`` columns.
        scaler: Optional object with ``fit_transform``; when given, all
            columns of the stored frame are replaced by its output.
    """

    def __init__(self, method: type, params_range: dict,
                 X: pd.DataFrame,
                 scaler: "TransformerMixin | None" = None):
        self.method = method
        # Estimator produced by the most recent *actual* fit (cache hits in
        # `fit` do not refit and therefore leave this untouched).
        self.clusterer = None
        self.params_range = params_range
        # Work on a private copy so that scaling never modifies the frame
        # owned by the caller.
        self.X = X.copy()
        # Per-instance cache: sorted (name, value) parameter pairs -> labels.
        # A plain dict is used instead of functools.lru_cache, whose
        # class-level cache would keep every instance alive.
        self._labels_cache = {}

        if scaler is not None:
            self.X[self.X.columns] = scaler.fit_transform(self.X)

    def fit(self, **kwargs):
        """Return cluster labels for the given estimator parameters.

        Results are cached per parameter combination, so repeating a call
        with the same keyword arguments does not refit the estimator.

        Args:
            **kwargs: Keyword arguments forwarded to the estimator class.
                Values must be hashable because they form the cache key.

        Returns:
            The ``labels_`` attribute of the fitted estimator.
        """
        # Keyword names are unique, so sorting the items never needs to
        # compare the values themselves.
        cache_key = tuple(sorted(kwargs.items()))
        if cache_key not in self._labels_cache:
            self.clusterer = self.method(**kwargs)
            self.clusterer.fit(self.X)
            self._labels_cache[cache_key] = self.clusterer.labels_

        return self._labels_cache[cache_key]

    def plot(self, **kwargs):
        """Cluster with the given parameters and draw a labelled scatter plot.

        Args:
            **kwargs: Estimator parameters, forwarded to :meth:`fit`.
        """
        labels = self.fit(**kwargs)
        fig, ax = plt.subplots(figsize=(9, 9))
        # Keyword arguments are required by recent seaborn releases, which
        # no longer accept x / y / hue positionally.
        sns.scatterplot(x=self.X['FSC-A-'], y=self.X['SSC-A-'],
                        hue=labels, ax=ax)
        fig.canvas.draw()

In [None]:
# Scaler applied to the features before clustering
# (MinMaxScaler() is a possible alternative).
scaler = StandardScaler(with_std=True, with_mean=True)
clusterer = InteractiveClusterer(
    X=X,
    scaler=scaler,
    **clustering[method_name]
)

# Hand the plotting callback and the parameter widgets to `interact`.
interact(clusterer.plot, **params_range)