# Практика по кластеризации

In [None]:
import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
import warnings

from abc import ABCMeta
from IPython.display import display
from functools import lru_cache
from ipywidgets import interact, fixed, IntSlider, FloatSlider
from matplotlib import rcParams
from sklearn.base import TransformerMixin
from sklearn.cluster import (MeanShift, AgglomerativeClustering, DBSCAN,
                             MiniBatchKMeans, KMeans, 
                             SpectralClustering)
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from typing import List

In [None]:
# Move the working directory two levels up so that the project-level imports
# in the next cell resolve (presumably to the repository root - confirm).
os.chdir(os.path.join(os.pardir, os.pardir))

In [None]:
from definitions import DATA_DIR
from src.utils.timer import timeit

In [None]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default font size and figure size (width, height) for every plot below.
rcParams['font.size'] = 14
rcParams['figure.figsize'] = 11, 9

# Silence all warnings for the rest of the session; note that this also hides
# deprecation notices from the plotting and clustering libraries.
warnings.filterwarnings('ignore')

# Single seed: fed to NumPy's global RNG here and reused as ``random_state``
# for the estimators configured in ``clustering``.
SEED = 5
np.random.seed(SEED)

### Используемые данные.
Проточная цитометрия — метод исследования дисперсных сред в режиме поштучного анализа элементов дисперсной фазы по сигналам светорассеяния и флуоресценции. Название метода связано с основным приложением, а именно, с исследованием одиночных биологических клеток в потоке.
<img src="../../misc/cytometry.png" width="680"/>

In [None]:
# Load one table per file found in the flow-cytometry data directory.
# ``os.listdir`` returns entries in arbitrary order, so the names are sorted
# to keep the patient index (position in ``dfs``) reproducible between runs
# and machines.
flowcytometry_dir = os.path.join(DATA_DIR, 'flowcytometry')
dfs = [pd.read_csv(os.path.join(flowcytometry_dir, file_name))
       for file_name in sorted(os.listdir(flowcytometry_dir))]

In [None]:
# Summary statistics for the first patient's table (shown as the cell output).
dfs[0].describe()

In [None]:
# For every patient table, print how many columns contain at least one
# missing value (patients are numbered from 1).
for ind, df in enumerate(dfs):
    print(f'Patient {ind + 1}:', df.isna().any().sum())

In [None]:
# Column labels of the first table; used below as the axis choices for plots.
cols = dfs[0].columns

In [None]:
def scatterplot2d(df, col1='FSC-A-', col2='SSC-A-', labels=None, 
                  dots_size=6, palette="coolwarm"):
    """Draw a 2-D scatter plot of two columns of ``df`` on a new figure.

    Args:
        df: table holding the points; indexed by column name.
        col1: name of the column plotted on the x axis.
        col2: name of the column plotted on the y axis.
        labels: optional per-row values used to colour the points.
        dots_size: marker size, forwarded to seaborn as ``s``.
        palette: seaborn palette used to colour the points when ``labels``
            is given.
    """
    fig, ax = plt.subplots()
    sns.scatterplot(
        # x / y are passed by keyword: newer seaborn releases no longer accept
        # them positionally.
        x=df[col1],
        y=df[col2],
        hue=labels,
        s=dots_size,
        # Honour the caller's palette (it used to be hard-coded); a palette is
        # only meaningful when there is a hue to map.
        palette=palette if labels is not None else None,
        ax=ax,
    )
    fig.canvas.draw()

In [None]:
# Quick look at the first patient: second column against the third one.
scatterplot2d(dfs[0], col1=cols[1], col2=cols[2])

### Кластеризация

In [None]:
# Registry of the clustering algorithms offered by the interactive explorer.
# Every entry maps a short name to:
#   'method'       - the estimator class to instantiate;
#   'params_range' - for each constructor argument, the values offered to the
#                    user; arguments wrapped in ``fixed`` are passed through
#                    as-is without a widget.
# NOTE(review): some arguments listed here (e.g. ``n_jobs`` for KMeans,
# ``affinity`` for AgglomerativeClustering) are not accepted by newer
# scikit-learn releases - confirm against the installed version.
clustering = {
    'meanshift': {
        'method': MeanShift,
        'params_range': {
            'bandwidth': list(np.arange(0.3, 1.5, 0.05)) + [None],
            'bin_seeding': [True, False],
            'n_jobs': list(range(1, 5)) + [-1],
        },
    },
    'agglomerative': {
        'method': AgglomerativeClustering,
        'params_range': {
            'n_clusters': list(range(2, 50)),
            'affinity': ['euclidean', 'manhattan'],
            'linkage': ['ward', 'complete', 'average', 'single'],
        },
    },
    'dbscan': {
        'method': DBSCAN,
        'params_range': {
            'eps': list(np.arange(0.01, 0.2, 0.01)),
            'min_samples': list(range(1, 26)),
            'metric': ['euclidean', 'manhattan'],
            'n_jobs': list(range(1, 5)) + [-1],
        },
    },
    'em': {
        'method': GaussianMixture,
        'params_range': {
            'n_components': list(range(2, 50)),
            'covariance_type': ['full', 'tied', 'diag', 'spherical'],
            'n_init': list(range(1, 6)),
            'init_params': ['kmeans', 'random'],
            'random_state': fixed(SEED),
        },
    },
    'kmeans': {
        'method': KMeans,
        'params_range': {
            'n_clusters': list(range(2, 50)),
            'n_init': list(range(5, 25)),
            'random_state': fixed(SEED),
            'n_jobs': list(range(1, 5)) + [-1],
        },
    },
    'mbkmeans': {
        'method': MiniBatchKMeans,
        'params_range': {
            'n_clusters': list(range(2, 50)),
            'batch_size': list(range(100, 1001, 100)),
            'n_init': list(range(3, 8)),
            'random_state': fixed(SEED),
        },
    },
    'spectral': {
        'method': SpectralClustering,
        'params_range': {
            'n_clusters': list(range(2, 50)),
            'n_components': list(range(2, 50)),
            'affinity': ['nearest_neighbors', 'rbf'],
            'gamma': list(np.arange(0.5, 2, 0.1)),
            'n_neighbors': list(range(2, 25)),
            'assign_labels': ['kmeans', 'discretize'],
            'n_init': list(range(10, 25)),
            'random_state': fixed(SEED),
            'n_jobs': list(range(1, 5)) + [-1],
        },
    },
}

In [None]:
class InteractiveClusterer:
    """Cluster two columns of a patient table and plot the labelled points.

    Intended to be driven by ``ipywidgets.interact``: each keyword of
    :meth:`analysis2d` corresponds to a widget, and any extra keyword is
    forwarded to the estimator constructor.

    Attributes are set in ``__init__``:
        method: estimator class instantiated on every (uncached) fit.
        clusterer: the most recently fitted estimator instance, or ``None``.
        params_range: widget value ranges for the estimator arguments.
        dfs: list of patient tables; ``patient`` indexes into it.
        scaler: transformer applied when ``do_scaling`` is enabled.
        curr_df: the two-column working copy used by the last analysis.
    """

    def __init__(self, method: type, params_range: dict, 
                 dfs: List[pd.DataFrame], 
                 scaler: "TransformerMixin | None" = None):
        """Store the configuration.

        Args:
            method: estimator class; called as ``method(**kwargs)``.
            params_range: value ranges for the estimator arguments.
            dfs: patient tables.
            scaler: object with ``fit_transform``; a ``MinMaxScaler`` is
                created when omitted.
        """
        self.method = method
        self.clusterer = None
        self.params_range = params_range
        self.dfs = dfs
        # Keep the scaler on the instance (it used to be dropped, which made
        # ``analysis2d`` depend on a module-level ``scaler`` variable) and
        # build the default lazily rather than once at definition time.
        self.scaler = MinMaxScaler() if scaler is None else scaler
        self.curr_df = None
        
    # NOTE: the cache is keyed on ``self`` plus all arguments and never evicts,
    # so it keeps the instance and every returned label array alive.
    @lru_cache(maxsize=None)
    def fit_predict(self, 
            # only here so that they take part in the cache key
            patient=0, col1='FSC-A-', col2='SSC-A-', do_scaling=False, 
            **kwargs):
        """Fit ``method(**kwargs)`` on ``self.curr_df`` and return the labels.

        ``patient``, ``col1``, ``col2`` and ``do_scaling`` are not used in the
        body; they identify which data ``self.curr_df`` currently holds so
        that cached results are not mixed up between selections. On a cache
        hit no estimator is fitted and ``self.clusterer`` is left untouched.

        Args:
            **kwargs: constructor arguments for the estimator (must be
                hashable because of the cache).

        Returns:
            The fitted estimator's ``labels_`` when it exposes them,
            otherwise the result of ``predict`` on the same data.
        """
        self.clusterer = self.method(**kwargs)
        self.clusterer.fit(self.curr_df)
        
        # Estimators such as GaussianMixture expose no ``labels_`` after
        # fitting; fall back to ``predict`` for them.
        labels = getattr(self.clusterer, 'labels_', None)
        if labels is None:
            labels = self.clusterer.predict(self.curr_df)
        return labels
    
    def analysis2d(self, print_clust_num=False, dots_size=5, palette = 'coolwarm', 
                   patient=0, col1='FSC-A-', col2='SSC-A-', do_scaling=False, 
                   plot_scaled=False, **kwargs):
        """Cluster two columns of one patient table and draw the result.

        Args:
            print_clust_num: print the number of distinct labels found. Every
                distinct value counts, so a noise label (DBSCAN's ``-1``) is
                included.
            dots_size: marker size for the scatter plot.
            palette: seaborn palette for the cluster colours.
            patient: index into ``self.dfs``.
            col1: column used as the first feature / x axis.
            col2: column used as the second feature / y axis.
            do_scaling: transform the two columns with ``self.scaler`` before
                clustering.
            plot_scaled: plot the (possibly scaled) working copy instead of
                the original table.
            **kwargs: forwarded to the estimator constructor.
        """
        self.curr_df = self.dfs[patient][[col1, col2]].copy()
        
        if do_scaling:
            self.curr_df[self.curr_df.columns] = self.scaler.fit_transform(self.curr_df)
        
        labels = self.fit_predict(patient=patient, col1=col1, col2=col2,
                                  do_scaling=do_scaling, **kwargs)

        if print_clust_num:
            print('Число кластеров:', len(set(labels)))
        
        # Forward the display options as well; they used to be accepted here
        # but never reached the plot.
        scatterplot2d(self.curr_df if plot_scaled else self.dfs[patient], 
                      col1=col1, col2=col2, labels=labels,
                      dots_size=dots_size, palette=palette)

In [None]:
# Key into ``clustering`` choosing the algorithm the widgets will drive.
method_name = 'dbscan'
# Value ranges for that algorithm's constructor arguments.
params_range = clustering[method_name]['params_range']

In [None]:
# Scaler applied to the selected columns when ``do_scaling`` is enabled;
# MinMaxScaler is the alternative.
scaler = StandardScaler()  #MinMaxScaler()
# ``clustering[method_name]`` supplies the ``method`` and ``params_range``
# keyword arguments.
clusterer = InteractiveClusterer(**clustering[method_name], 
                                 dfs=dfs, 
                                 scaler=scaler)

In [None]:
# Build the widget panel: every keyword becomes a control whose value is
# passed to ``clusterer.analysis2d``; ``params_range`` adds the controls for
# the chosen algorithm's own arguments.
# The patient choices follow the number of loaded tables instead of a
# hard-coded 5, so every loaded table is selectable and no index is out of range.
interact(clusterer.analysis2d, 
         print_clust_num=True, 
         dots_size=[*range(1, 15)], palette='coolwarm', 
         patient=[*range(len(dfs))], col1=cols, col2=cols,
         do_scaling=[False, True], plot_scaled=[False, True],
         **params_range)