# Практика по снижению размерности

In [None]:
import os
import sys

In [None]:
sys.path.append(os.path.join('..', '..'))

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import warnings

from matplotlib import rcParams
from mpl_toolkits.mplot3d import Axes3D
from sklearn import datasets
from sklearn.decomposition import PCA, KernelPCA
from sklearn.manifold import Isomap, LocallyLinearEmbedding, MDS, SpectralEmbedding, TSNE
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# from umap import UMAP

from src.utils.common import get_data_folder, timeit

%matplotlib notebook
# Global matplotlib defaults applied to every figure created below.
rcParams['font.size'] = 14
rcParams['figure.figsize'] = 7, 6

# NOTE(review): this silences every warning category, so deprecation
# warnings raised by the plotting / ML libraries are hidden as well.
warnings.filterwarnings('ignore')

# One seed shared by NumPy's global RNG and by the `random_state`
# arguments passed in the cells further down.
SEED = 3141952
np.random.seed(SEED)

In [None]:
# Synthetic S-curve: 1300 three-dimensional points plus one value per point
# that is later used to colour the plots.
X, scurve_labels = datasets.make_s_curve(n_samples=1300, random_state=SEED)
df_scurve = pd.DataFrame(X, columns=['dim1', 'dim2', 'dim3'])

# Both CSV files are read from the project data folder; their first column
# becomes the DataFrame index.
data_folder = get_data_folder()
df_flow = pd.read_csv(
    os.path.join(data_folder, 'flowcytometry', 'patient_1.csv'),
    index_col=0,
)
df_is = pd.read_csv(
    os.path.join(data_folder, 'df_is.csv'),
    index_col=0,
)

### Кратко о данных

* df_scurve -- стандартный пример для иллюстрации нелинейных техник
* df_flow -- с лекции по кластеризации
* df_is -- датасет с замерами иммунного статуса. Иммунный статус -- комплексная оценка иммунитета на основе исследования субпопуляций лимфоцитов. Значения *_rel == * / lymph * 100

In [None]:
scurve_cols = ['dim1', 'dim2', 'dim3']

In [None]:
# Columns of the flow-cytometry table used as features; the cells below
# scale all of them and pass slices of this list to the embedding runs.
flow_cols = [
    'FSC-A-', 'SSC-A-', 'FITC-A-CD25', 'PE-A-CD127',
    'PerCP-Cy5-5-A-CD4', 'PE-Cy7-A-', 'APC-A-', 'APC-Cy7-A-',
    'Pacific Blue-A-', 'AmCyan-A-'
]

In [None]:
# Immune-status columns kept for the analysis.
is_cols = [
    'lymph',                  # lymphocytes
    't_lymph', 't_lymph_rel', # T-lymphocytes and derived measures
    't_help', 't_help_rel',
    'ctl', 'ctl_rel',
    'b_lymph', 'b_lymph_rel', # B-lymphocytes
    'nk', 'nk_rel',           # natural killer cells
    'cd4_cd8'                 # index based on the ratio of T-helpers to cytotoxic T-lymphocytes (CTL)
]
# Restrict the table to the columns listed above (in that order).
df_is = df_is[is_cols]

In [None]:
df_is.head()

In [None]:
df_is.describe()

In [None]:
df_is.isna().sum()

In [None]:
def scatterplot2d(df, col1, col2,
                  labels=None,
                  dots_size=6, palette='coolwarm'):
    """Draw a 2-D scatter plot of two columns of ``df`` on a new figure.

    Args:
        df: table holding the data to plot.
        col1: name of the column shown on the x axis.
        col2: name of the column shown on the y axis.
        labels: optional per-row values used to colour the points (seaborn ``hue``).
        dots_size: marker size passed to seaborn as ``s``.
        palette: seaborn palette used for ``labels``; ignored when ``labels`` is None.
    """
    fig, ax = plt.subplots()
    # x and y must be keywords: seaborn 0.12+ rejects positional data vectors.
    # The palette is only meaningful together with a hue variable.
    sns.scatterplot(
        x=df[col1],
        y=df[col2],
        hue=labels,
        s=dots_size,
        palette=palette if labels is not None else None,
        ax=ax,
    )
    fig.canvas.draw()

def scatterplot3d(df, col1, col2, col3,
                  labels=None,
                  dots_size=6, palette='coolwarm'):
    """Draw a 3-D scatter plot of three columns of ``df`` on a new figure.

    String labels are treated as categories: each distinct value is drawn as
    its own series with a legend entry. Any other labels (or ``None``) are
    passed to matplotlib as colour values mapped through the Spectral colormap.

    Args:
        df: table holding the data to plot.
        col1, col2, col3: names of the columns shown on the x, y and z axes;
            they are also used as the axis titles.
        labels: optional per-row labels (array, list or Series).
        dots_size: accepted for signature parity with ``scatterplot2d``.
            NOTE(review): not applied here, markers keep matplotlib's default size.
        palette: accepted for signature parity with ``scatterplot2d``.
            NOTE(review): not applied here, colours come from the rules above.
    """
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')

    # Plain arrays so the boolean masks below select rows by position,
    # independent of whatever index ``df`` carries.
    x, y, z = (df[col].to_numpy() for col in (col1, col2, col3))

    # Title the axes after the plotted columns rather than fixed names.
    ax.set_xlabel(str(col1))
    ax.set_ylabel(str(col2))
    ax.set_zlabel(str(col3))

    # Accept lists and Series as well as arrays.
    label_arr = None if labels is None else np.asarray(labels)

    if label_arr is not None and label_arr.dtype.kind == 'U':
        colors = 'rgbcmy'
        for i, group in enumerate(np.unique(label_arr)):
            mask = label_arr == group
            # Cycle through the colour list so more than six groups do not fail.
            ax.scatter(x[mask], y[mask], z[mask],
                       c=colors[i % len(colors)], label=group)
        ax.legend()
    else:
        ax.scatter(x, y, z, c=label_arr, cmap=plt.cm.Spectral)
    plt.show()

### Preprocessing

#### S curve

In [None]:
df_scurve.describe()

In [None]:
# Cut a hole in the data: keep only the points for which
# dim1^2 + (dim2 - 1)^2 >= 0.2, and filter the labels with the same mask.
scurve_mask = df_scurve['dim1'] ** 2 + (df_scurve['dim2'] - 1) ** 2 >= 0.2
df_scurve = df_scurve[scurve_mask]
scurve_labels = scurve_labels[scurve_mask]

In [None]:
scatterplot3d(df_scurve, 
              col1='dim1', col2='dim2', col3='dim3', 
              labels=scurve_labels)

In [None]:
# Standardise the S-curve coordinates on a copy, so df_scurve keeps the raw values.
df_scurve_scaled = df_scurve.copy()
scurve_scaler = StandardScaler()
df_scurve_scaled[scurve_cols] = scurve_scaler.fit_transform(df_scurve_scaled[scurve_cols])

In [None]:
scatterplot3d(df_scurve_scaled, 
              col1='dim1', col2='dim2', col3='dim3', 
              labels=scurve_labels)

#### Flowcytometry

In [None]:
# Remove part of the outliers: rows where FSC-A- or SSC-A- exceeds its cutoff.
flow_mask = (df_flow['FSC-A-'] > 200000) | (df_flow['SSC-A-'] > 240000)
# NOTE(review): reset_index() without drop=True keeps the previous index
# as an extra column of df_flow.
df_flow = df_flow.drop(df_flow[flow_mask].index).reset_index()

In [None]:
# Subsample to at most 1000 rows. The explicit random_state makes the sample
# independent of how much of the global NumPy RNG earlier cells consumed,
# and the min() guard avoids a ValueError when fewer rows are left.
df_flow = df_flow.sample(min(1000, len(df_flow)), random_state=SEED)

In [None]:
# Manual labelling of the subpopulations.
def find_flow_labels(df):
    """Assign a subpopulation name to every row from its FSC-A-/SSC-A- values.

    The gates are checked in the order listed below and the first one that
    matches wins; rows matching no gate are labelled ``'noise'``.

    Args:
        df: table with ``'FSC-A-'`` and ``'SSC-A-'`` columns.

    Returns:
        NumPy array of strings, one label per row of ``df``.
    """
    fsc = df['FSC-A-']
    ssc = df['SSC-A-']

    # (label, gate) pairs; order matters because the gates overlap.
    gates = [
        ('debris', (fsc < 40000) & (ssc < 35000)),
        ('lymph', fsc.between(35000, 100000) & (ssc < 50000)),
        ('mono', fsc.between(75000, 150000) & ssc.between(50000, 90000)),
        ('other', fsc.between(75000, 200000) & ssc.between(90000, 250000)),
    ]
    names = [name for name, _ in gates]
    masks = [mask for _, mask in gates]
    return np.select(masks, names, default='noise')
    
flow_labels = find_flow_labels(df_flow)

In [None]:
scatterplot2d(df_flow, col1=flow_cols[0], col2=flow_cols[1], labels=flow_labels)

In [None]:
# Standardise the feature columns on a copy; columns outside flow_cols are left untouched.
df_flow_scaled = df_flow.copy()
flow_scaler = StandardScaler()
df_flow_scaled[flow_cols] = flow_scaler.fit_transform(df_flow_scaled[flow_cols])

#### IS

In [None]:
# Subsample to at most 1000 rows. The explicit random_state makes the sample
# independent of how much of the global NumPy RNG earlier cells consumed,
# and the min() guard avoids a ValueError when the table has fewer rows.
df_is = df_is.sample(min(1000, len(df_is)), random_state=SEED)

In [None]:
# Manual labelling of the subpopulations.
def find_is_labels(df):
    """Label every row by its ``lymph`` value.

    The conditions are checked in order and the first match wins, so the
    boundary value 1.6 is labelled ``'low'`` and 2.4 is labelled ``'norm'``.
    Rows matching no condition (e.g. missing values) are labelled ``'noise'``.

    Args:
        df: table with a ``'lymph'`` column.

    Returns:
        NumPy array of strings (``'low'``, ``'norm'``, ``'high'`` or
        ``'noise'``), one label per row of ``df``.
    """
    lymph = df['lymph']
    conditions = [
        lymph <= 1.6,
        lymph.between(1.6, 2.4),
        lymph >= 2.4,
    ]
    values = ['low', 'norm', 'high']
    return np.select(conditions, values, default='noise')
    
is_labels = find_is_labels(df_is)

In [None]:
# Standardise the immune-status columns on a copy, so df_is keeps the raw values.
df_is_scaled = df_is.copy()
is_scaler = StandardScaler()
df_is_scaled[is_cols] = is_scaler.fit_transform(df_is_scaled[is_cols])

In [None]:
scatterplot2d(df_is_scaled, col1='t_lymph_rel', col2='lymph', labels=is_labels)

### Основные алгоритмы

#### PCA

* docs: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
* code: https://github.com/scikit-learn/scikit-learn/blob/95119c13a/sklearn/decomposition/_pca.py#L112
    * На основе SVD разложения: https://github.com/scikit-learn/scikit-learn/blob/95119c13a/sklearn/decomposition/_pca.py#L454

In [None]:
@timeit
def run_pca(df, labels=None, plot_type='2d', **kwargs):
    """Fit PCA on ``df``, print its diagnostics and plot the embedding.

    Args:
        df: data passed to ``PCA.fit_transform``.
        labels: optional per-row labels forwarded to the plotting helper.
        plot_type: ``'2d'`` or ``'3d'``; any other value skips plotting.
        **kwargs: forwarded unchanged to the ``PCA`` constructor.

    Returns:
        The undecorated function returns a DataFrame with columns ``dim1``,
        ``dim2`` (plus ``dim3`` for ``'3d'``) holding the leading embedding
        coordinates, or ``None`` when ``plot_type`` is neither ``'2d'`` nor
        ``'3d'``.
    """
    model = PCA(**kwargs)
    embedding = model.fit_transform(df)

    cumulative_ratio = np.cumsum(model.explained_variance_ratio_)
    print('Объясненная дисперсия (кумулятивная):', *cumulative_ratio, sep='\n')
    print('Главные компоненты:', *model.components_, sep='\n')

    # Unknown plot types produce neither a plot nor a result.
    if plot_type not in ('2d', '3d'):
        return None

    n_dims = 2 if plot_type == '2d' else 3
    df_emb = pd.DataFrame({f'dim{i + 1}': embedding[:, i] for i in range(n_dims)})

    if n_dims == 2:
        scatterplot2d(df_emb, col1='dim1', col2='dim2', labels=labels)
    else:
        scatterplot3d(df_emb, col1='dim1', col2='dim2', col3='dim3', labels=labels)
    return df_emb

In [None]:
run_pca(df_scurve_scaled, 
        labels=scurve_labels, 
        plot_type='2d', 
        n_components=2, 
        random_state=SEED);

In [None]:
run_pca(df_flow_scaled[flow_cols[:5]],  # df_flow_scaled[flow_cols[:2] + flow_cols[4:5]]
        labels=flow_labels, 
        plot_type='2d', 
        n_components=2, 
        random_state=SEED);

In [None]:
run_pca(df_is_scaled,
        labels=is_labels, 
        plot_type='2d', 
        n_components=2, 
        random_state=SEED);

#### MDS

* docs: https://scikit-learn.org/stable/modules/generated/sklearn.manifold.MDS.html#sklearn.manifold.MDS
* code: https://github.com/scikit-learn/scikit-learn/blob/95119c13a/sklearn/manifold/_mds.py#L279
    * smacof: https://github.com/scikit-learn/scikit-learn/blob/95119c13af77c76e150b753485c662b7c52a41a2/sklearn/manifold/_mds.py#L136
    * smacof_single: https://github.com/scikit-learn/scikit-learn/blob/95119c13af77c76e150b753485c662b7c52a41a2/sklearn/manifold/_mds.py#L22

In [None]:
@timeit
def run_mds(df, labels=None, plot_type='2d', **kwargs):
    """Compute an MDS embedding of ``df`` and plot it.

    Args:
        df: data passed to ``MDS.fit_transform``.
        labels: optional per-row labels forwarded to the plotting helper.
        plot_type: ``'2d'`` or ``'3d'``; any other value skips plotting.
        **kwargs: forwarded unchanged to the ``MDS`` constructor.

    Returns:
        The undecorated function returns a DataFrame with the first two
        (``'2d'``) or three (``'3d'``) embedding coordinates in columns
        ``dim1``..``dim3``, or ``None`` for any other ``plot_type``.
    """
    embedding = MDS(**kwargs).fit_transform(df)

    # Pick the output columns and the matching plotting helper.
    if plot_type == '2d':
        axes, plot = ('dim1', 'dim2'), scatterplot2d
    elif plot_type == '3d':
        axes, plot = ('dim1', 'dim2', 'dim3'), scatterplot3d
    else:
        return None

    df_emb = pd.DataFrame({name: embedding[:, pos] for pos, name in enumerate(axes)})
    plot(df_emb, *axes, labels=labels)
    return df_emb

In [None]:
# MDS starts from random configurations; seed it like the other stochastic
# embeddings so that re-running the cell reproduces the plot.
run_mds(df_scurve_scaled,
        labels=scurve_labels,
        plot_type='2d',
        n_components=2,
        random_state=SEED,
        n_jobs=4);

In [None]:
# Alternative feature subset: df_flow_scaled[flow_cols[:2] + flow_cols[4:5]]
# The plot is 2-D, so embed into 2 dimensions: the first two axes of a 3-D MDS
# solution are not the 2-D solution. Seeded so the cell is reproducible.
run_mds(df_flow_scaled[flow_cols[:5]],
        labels=flow_labels,
        plot_type='2d',
        n_components=2,
        random_state=SEED,
        n_jobs=4);

In [None]:
# MDS starts from random configurations; seed it like the other stochastic
# embeddings so that re-running the cell reproduces the plot.
run_mds(df_is_scaled,
        labels=is_labels,
        plot_type='2d',
        n_components=2,
        random_state=SEED,
        n_jobs=4);

#### ISOMAP

* docs: https://scikit-learn.org/stable/modules/generated/sklearn.manifold.Isomap.html#sklearn.manifold.Isomap
* code: https://github.com/scikit-learn/scikit-learn/blob/95119c13a/sklearn/manifold/_isomap.py#L16
    * Вместо MDS загоняют матрицу геодезических расстояний в Kernel PCA, который, вообще говоря, эквивалентен classical MDS

In [None]:
@timeit
def run_isomap(df, labels=None, plot_type='2d', **kwargs):
    """Compute an Isomap embedding of ``df`` and plot it.

    Args:
        df: data passed to ``Isomap.fit_transform``.
        labels: optional per-row labels forwarded to the plotting helper.
        plot_type: ``'2d'`` or ``'3d'``; any other value skips plotting.
        **kwargs: forwarded unchanged to the ``Isomap`` constructor.

    Returns:
        The undecorated function returns a DataFrame with the first two
        (``'2d'``) or three (``'3d'``) embedding coordinates in columns
        ``dim1``..``dim3``, or ``None`` for any other ``plot_type``.
    """
    reducer = Isomap(**kwargs)
    coords = reducer.fit_transform(df)

    wants_2d = plot_type == '2d'
    wants_3d = plot_type == '3d'
    if not (wants_2d or wants_3d):
        return None

    # The first two coordinates are always needed; the third only for 3-D plots.
    columns = {'dim1': coords[:, 0], 'dim2': coords[:, 1]}
    if wants_3d:
        columns['dim3'] = coords[:, 2]
    df_isomap_emb = pd.DataFrame(columns)

    if wants_2d:
        scatterplot2d(df_isomap_emb, col1='dim1', col2='dim2', labels=labels)
    else:
        scatterplot3d(df_isomap_emb, col1='dim1', col2='dim2', col3='dim3', labels=labels)
    return df_isomap_emb

In [None]:
run_isomap(df_scurve_scaled, 
           labels=scurve_labels, 
           plot_type='2d',
           n_neighbors=10,
           n_components=2, 
           n_jobs=4);

In [None]:
run_isomap(df_flow_scaled[flow_cols[:5]],  # df_flow_scaled[flow_cols[:2] + flow_cols[4:5]]
           labels=flow_labels, 
           plot_type='2d',
           n_neighbors=10,
           n_components=2, 
           n_jobs=4);

In [None]:
run_isomap(df_is_scaled, 
           labels=is_labels, 
           plot_type='2d',
           n_neighbors=10,
           n_components=2, 
           n_jobs=4);

#### LLE

* docs: https://scikit-learn.org/stable/modules/generated/sklearn.manifold.LocallyLinearEmbedding.html#sklearn.manifold.LocallyLinearEmbedding
* code: https://github.com/scikit-learn/scikit-learn/blob/95119c13a/sklearn/manifold/_locally_linear.py#L525
    * https://github.com/scikit-learn/scikit-learn/blob/95119c13af77c76e150b753485c662b7c52a41a2/sklearn/manifold/_locally_linear.py#L193

In [None]:
@timeit
def run_lle(df, labels=None, plot_type='2d', **kwargs):
    """Compute a locally linear embedding of ``df`` and plot it.

    Args:
        df: data passed to ``LocallyLinearEmbedding.fit_transform``.
        labels: optional per-row labels forwarded to the plotting helper.
        plot_type: ``'2d'`` or ``'3d'``; any other value skips plotting.
        **kwargs: forwarded unchanged to the ``LocallyLinearEmbedding`` constructor.

    Returns:
        The undecorated function returns a DataFrame with the first two
        (``'2d'``) or three (``'3d'``) embedding coordinates in columns
        ``dim1``..``dim3``, or ``None`` for any other ``plot_type``.
    """
    points = LocallyLinearEmbedding(**kwargs).fit_transform(df)

    if plot_type == '2d':
        flat = pd.DataFrame({'dim1': points[:, 0], 'dim2': points[:, 1]})
        scatterplot2d(flat, col1='dim1', col2='dim2', labels=labels)
        return flat

    if plot_type == '3d':
        first, second, third = points[:, 0], points[:, 1], points[:, 2]
        spatial = pd.DataFrame({'dim1': first, 'dim2': second, 'dim3': third})
        scatterplot3d(spatial, col1='dim1', col2='dim2', col3='dim3', labels=labels)
        return spatial

    # Any other plot type: nothing is drawn and nothing is returned.
    return None

In [None]:
run_lle(df_scurve_scaled, 
        labels=scurve_labels, 
        plot_type='2d',
        n_neighbors=10,
        n_components=2, 
        random_state=SEED,
        n_jobs=4);

In [None]:
run_lle(df_flow_scaled[flow_cols[:5]],  # df_flow_scaled[flow_cols[:2] + flow_cols[4:5]]
        labels=flow_labels, 
        plot_type='2d',
        n_neighbors=15,
        n_components=2, 
        random_state=SEED,
        n_jobs=4);

In [None]:
run_lle(df_is_scaled, 
        labels=is_labels, 
        plot_type='2d',
        n_neighbors=10,
        n_components=2, 
        random_state=SEED,
        n_jobs=4);

#### Laplacian eigenmaps (Spectral embedding)

* docs: https://scikit-learn.org/stable/modules/generated/sklearn.manifold.SpectralEmbedding.html#sklearn.manifold.SpectralEmbedding
* code: https://github.com/scikit-learn/scikit-learn/blob/95119c13a/sklearn/manifold/_spectral_embedding.py#L363
    * spectral_embedding: https://github.com/scikit-learn/scikit-learn/blob/95119c13af77c76e150b753485c662b7c52a41a2/sklearn/manifold/_spectral_embedding.py#L145
    * На основе scipy.sparse.csgraph.laplacian: https://github.com/scipy/scipy/blob/v1.6.1/scipy/sparse/csgraph/_laplacian.py#L16-L79

In [None]:
@timeit
def run_lap_eig(df, labels=None, plot_type='2d', **kwargs):
    """Compute a spectral (Laplacian eigenmaps) embedding of ``df`` and plot it.

    Args:
        df: data passed to ``SpectralEmbedding.fit_transform``.
        labels: optional per-row labels forwarded to the plotting helper.
        plot_type: ``'2d'`` or ``'3d'``; any other value skips plotting.
        **kwargs: forwarded unchanged to the ``SpectralEmbedding`` constructor.

    Returns:
        The undecorated function returns a DataFrame with the first two
        (``'2d'``) or three (``'3d'``) embedding coordinates in columns
        ``dim1``..``dim3``, or ``None`` for any other ``plot_type``.
    """
    embedder = SpectralEmbedding(**kwargs)
    embedding = embedder.fit_transform(df)

    if plot_type == '2d':
        n_dims = 2
    elif plot_type == '3d':
        n_dims = 3
    else:
        return None

    names = ['dim1', 'dim2', 'dim3'][:n_dims]
    # Columns are extracted one by one so that asking for a third dimension
    # the embedding does not have still fails with an IndexError.
    df_lap_eig_emb = pd.DataFrame({name: embedding[:, k] for k, name in enumerate(names)})

    if n_dims == 2:
        scatterplot2d(df_lap_eig_emb, col1='dim1', col2='dim2', labels=labels)
    else:
        scatterplot3d(df_lap_eig_emb, col1='dim1', col2='dim2', col3='dim3', labels=labels)
    return df_lap_eig_emb

In [None]:
run_lap_eig(df_scurve_scaled, 
            labels=scurve_labels, 
            plot_type='2d',
            n_neighbors=10,
            n_components=4, 
            random_state=SEED,
            n_jobs=4);

In [None]:
run_lap_eig(df_flow_scaled[flow_cols[:5]], # df_flow_scaled[flow_cols[:2] + flow_cols[4:5]]
            labels=flow_labels, 
            plot_type='2d',
            n_neighbors=10,
            n_components=2, 
            random_state=SEED,
            n_jobs=4);

In [None]:
run_lap_eig(df_is_scaled, 
            labels=is_labels, 
            plot_type='2d',
            n_neighbors=10,
            n_components=2, 
            random_state=SEED,
            n_jobs=4);