# Description

This notebook runs DBSCAN on the `umap` version of the data.

The notebook explores different values for `min_samples` and `eps` (the main parameters of DBSCAN).

# Environment variables

In [1]:
from IPython.display import display

import conf

# Number of parallel jobs, read from the project configuration; it is reused
# below for the thread-limit environment variables and for DBSCAN's `n_jobs`.
N_JOBS = conf.GENERAL["N_JOBS"]
display(N_JOBS)

10

In [2]:
%env MKL_NUM_THREADS=$N_JOBS
%env OPEN_BLAS_NUM_THREADS=$N_JOBS
%env NUMEXPR_NUM_THREADS=$N_JOBS
%env OMP_NUM_THREADS=$N_JOBS

env: MKL_NUM_THREADS=10
env: OPEN_BLAS_NUM_THREADS=10
env: NUMEXPR_NUM_THREADS=10
env: OMP_NUM_THREADS=10


# Modules loading

In [3]:
# Reload imported modules automatically before executing each cell (mode 2),
# so edits to project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise_distances
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt

from utils import generate_result_set_name
from clustering.ensembles.utils import generate_ensemble

# Global settings

In [5]:
# Seed NumPy's global random number generator so reruns are repeatable.
np.random.seed(0)

In [6]:
# Attribute names passed to generate_ensemble through its `attributes` argument.
CLUSTERING_ATTRIBUTES_TO_SAVE = ["n_clusters"]

In [7]:
# Root folder for this notebook's input and output files: the "shuffle_genes"
# subfolder of the configured null-clustering results directory.
NULL_DIR = conf.RESULTS["CLUSTERING_NULL_DIR"] / "shuffle_genes"

# Settings

In [8]:
# these parameter values are taken from the pre-analysis notebook for this clustering method and data version
k_values = np.arange(2, 125 + 1, 1)


def _eps_range_for(k):
    """Return the (eps_min, eps_max) interval to explore for a given k."""
    if k < 5:
        return (0.23, 0.80)
    if k < 10:
        return (0.30, 0.80)
    if k < 20:
        return (0.40, 2.50)
    if k < 50:
        return (0.40, 7.00)
    return (0.50, 10.00)


# one eps interval per k, keyed by the values of k_values
eps_range_per_k = {k: _eps_range_for(k) for k in k_values}

# Data version: umap

## Settings

In [9]:
# Data version to cluster; used below as a folder name and file-name prefix.
INPUT_SUBSET = "umap"

In [10]:
# Stem of the input dataset name; combined with INPUT_SUBSET to build the
# input file name and the results folder name.
INPUT_STEM = "z_score_std-projection-smultixcan-efo_partial-mashr-zscores"

In [11]:
# parameters of the dimensionality reduction step; they are passed to
# generate_result_set_name to build the name of the input file to load
DR_OPTIONS = {
    "n_components": 50,
    "metric": "euclidean",
    "n_neighbors": 15,
    "random_state": 0,
}

In [12]:
# Location of the transformed input data; the file name is derived from
# DR_OPTIONS by the project's naming helper.
input_filepath = (
    Path(NULL_DIR)
    / "data_transformations"
    / INPUT_SUBSET
    / generate_result_set_name(
        DR_OPTIONS, prefix=f"{INPUT_SUBSET}-{INPUT_STEM}-", suffix=".pkl"
    )
).resolve()
display(input_filepath)

# stop early when the expected input file is missing
assert input_filepath.exists(), "Input file does not exist"

# file name without its extension
input_filepath_stem = input_filepath.stem
display(input_filepath_stem)

PosixPath('/opt/data/results/clustering/null_sims/shuffle_genes/data_transformations/umap/umap-z_score_std-projection-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_50-n_neighbors_15-random_state_0.pkl')

'umap-z_score_std-projection-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_50-n_neighbors_15-random_state_0'

In [13]:
# Output folder for this notebook, created on demand (parents included).
RESULTS_DIR = (Path(NULL_DIR) / "runs" / f"{INPUT_SUBSET}-{INPUT_STEM}").resolve()
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

display(RESULTS_DIR)

PosixPath('/opt/data/results/clustering/null_sims/shuffle_genes/runs/umap-z_score_std-projection-smultixcan-efo_partial-mashr-zscores')

## Load input file

In [14]:
# Load the transformed data. Unpickling executes code embedded in the file,
# so only load pickles from a trusted source.
# NOTE(review): presumably this file is produced by an earlier step of this
# project's pipeline — confirm.
data = pd.read_pickle(input_filepath)

In [15]:
# dimensions of the loaded data (rows, columns)
data.shape

(3752, 50)

In [16]:
# preview of the first rows
data.head()

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5,UMAP6,UMAP7,UMAP8,UMAP9,UMAP10,...,UMAP41,UMAP42,UMAP43,UMAP44,UMAP45,UMAP46,UMAP47,UMAP48,UMAP49,UMAP50
100001_raw-Food_weight,3.507164,4.783988,2.721716,5.306452,3.881773,3.321169,4.960411,5.549141,4.84139,3.194715,...,9.187434,2.469095,1.234505,9.594598,0.333957,8.039461,5.482632,4.958725,2.801697,5.658304
100002_raw-Energy,3.606311,4.727725,2.704979,5.271563,3.724591,3.337485,4.781901,5.599291,4.791382,3.003243,...,9.121886,2.506307,1.227272,9.574743,0.344036,8.037237,5.477282,4.900286,2.729573,5.601664
100003_raw-Protein,3.594661,4.729409,2.682754,5.218021,3.652642,3.325793,4.771183,5.580573,4.820001,3.022622,...,9.097627,2.484138,1.221946,9.572356,0.339655,8.043584,5.472909,4.895183,2.765254,5.663905
100004_raw-Fat,3.459632,4.797582,2.755155,5.230875,3.78435,3.366962,4.942765,5.604582,4.731943,3.045358,...,9.124997,2.587374,1.199794,9.63343,0.286029,8.071218,5.476699,4.869335,2.766068,5.523112
100005_raw-Carbohydrate,3.436945,4.827942,2.761039,5.268426,4.005526,3.357875,5.036726,5.545567,4.842803,3.282768,...,9.181946,2.513265,1.200227,9.631493,0.349085,8.032649,5.486337,4.947569,2.811798,5.677997


In [17]:
# the input must not contain any missing value
assert data.notna().all().all()

## Clustering

### Generate clusterers

In [18]:
# Settings that drive the generation of the DBSCAN clusterers.
CLUSTERING_OPTIONS = {
    # K_RANGE is the min_samples parameter in DBSCAN (sklearn)
    "K_RANGE": k_values,
    # (eps_min, eps_max) interval to explore for each k
    "EPS_RANGE_PER_K": eps_range_per_k,
    # despite its name this is a count, not a step size: it is the number of
    # evenly spaced eps values generated with np.linspace for each k
    "EPS_STEP": 33,
    # metric used to compute the pairwise distance matrix
    "METRIC": "euclidean",
}

display(CLUSTERING_OPTIONS)

{'K_RANGE': array([  2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,
         15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,
         28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,
         41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,
         54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,
         67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
         80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,
         93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104, 105,
        106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118,
        119, 120, 121, 122, 123, 124, 125]),
 'EPS_RANGE_PER_K': {2: (0.23, 0.8),
  3: (0.23, 0.8),
  4: (0.23, 0.8),
  5: (0.3, 0.8),
  6: (0.3, 0.8),
  7: (0.3, 0.8),
  8: (0.3, 0.8),
  9: (0.3, 0.8),
  10: (0.4, 2.5),
  11: (0.4, 2.5),
  12: (0.4, 2.5),
  13: (0.4, 2.5),
  14: (0.4, 2.5),
  15: (0.4, 2.5),
  16: (0.4, 2.5),
  1

In [19]:
# One DBSCAN instance per (min_samples, eps) combination, keyed by
# "<class name> #<running index>".
CLUSTERERS = {}

_n_eps_values = CLUSTERING_OPTIONS["EPS_STEP"]
_eps_ranges = CLUSTERING_OPTIONS["EPS_RANGE_PER_K"]

# running index across all (k, eps) combinations
idx = 0

for k in CLUSTERING_OPTIONS["K_RANGE"]:
    eps_min, eps_max = _eps_ranges[k]

    # evenly spaced eps values covering the interval assigned to this k
    for eps in np.linspace(eps_min, eps_max, _n_eps_values):
        # the estimators are given a precomputed distance matrix
        clus = DBSCAN(min_samples=k, eps=eps, metric="precomputed", n_jobs=N_JOBS)

        method_name = type(clus).__name__
        CLUSTERERS[f"{method_name} #{idx}"] = clus

        idx += 1

In [20]:
# total number of clusterers generated (one per k/eps combination)
display(len(CLUSTERERS))

4092

In [21]:
# show the first two (name, estimator) entries as a sanity check
_iter = iter(CLUSTERERS.items())
display(next(_iter))
display(next(_iter))

('DBSCAN #0', DBSCAN(eps=0.23, metric='precomputed', min_samples=2, n_jobs=10))

('DBSCAN #1',
 DBSCAN(eps=0.24781250000000002, metric='precomputed', min_samples=2, n_jobs=10))

In [22]:
# Name of the clustering method, used as the prefix of the output file name.
# It is taken from the estimator class itself rather than from `method_name`,
# a variable that only exists if the clusterer-generation loop ran at least
# once; the resulting value is the same ("DBSCAN").
clustering_method_name = DBSCAN.__name__
display(clustering_method_name)

'DBSCAN'

### Generate ensemble

In [23]:
# Pairwise distance matrix between all rows of the data, computed once and
# handed to the clusterers, which are configured with metric="precomputed".
data_dist = pairwise_distances(data, metric=CLUSTERING_OPTIONS["METRIC"])

In [24]:
# the distance matrix is square: one row and one column per data row
data_dist.shape

(3752, 3752)

In [25]:
# Summary statistics over every entry of the distance matrix. The whole matrix
# is flattened, so each pair is counted twice and the self-distances on the
# diagonal are included (hence the minimum of 0).
# NOTE(review): `.apply(str)` presumably avoids abbreviated float formatting
# in the output — confirm.
pd.Series(data_dist.flatten()).describe().apply(str)

count            14077504.0
mean     3.0258994102478027
std       1.123066782951355
min                     0.0
25%      2.2383562326431274
50%      3.2953301668167114
75%      3.9533920884132385
max       5.173931121826172
dtype: object

In [26]:
# Build the ensemble of partitions from the clusterers, using the precomputed
# distance matrix as input and saving the requested attributes as columns.
# NOTE(review): the saved output has fewer rows (299) than clusterers (4092),
# so generate_ensemble presumably discards some partitions — confirm in
# clustering.ensembles.utils.
ensemble = generate_ensemble(
    data_dist,
    CLUSTERERS,
    attributes=CLUSTERING_ATTRIBUTES_TO_SAVE,
)

100%|██████████| 4092/4092 [07:20<00:00,  9.28it/s]


In [27]:
# number of partitions kept in the ensemble and number of columns
ensemble.shape

(299, 3)

In [28]:
# preview of the first ensemble members
ensemble.head()

Unnamed: 0_level_0,clusterer_params,partition,n_clusters
clusterer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
DBSCAN #0,"{'algorithm': 'auto', 'eps': 0.23, 'leaf_size'...","[nan, 0.0, 0.0, nan, nan, 0.0, 0.0, 0.0, 0.0, ...",18
DBSCAN #1,"{'algorithm': 'auto', 'eps': 0.247812500000000...","[nan, 0.0, 0.0, nan, nan, 0.0, 0.0, 0.0, 0.0, ...",15
DBSCAN #2,"{'algorithm': 'auto', 'eps': 0.265625, 'leaf_s...","[0.0, 0.0, 0.0, nan, nan, 0.0, 0.0, 0.0, 0.0, ...",8
DBSCAN #3,"{'algorithm': 'auto', 'eps': 0.2834375, 'leaf_...","[0.0, 0.0, 0.0, nan, nan, 0.0, 0.0, 0.0, 0.0, ...",2
DBSCAN #4,"{'algorithm': 'auto', 'eps': 0.30125, 'leaf_si...","[0.0, 0.0, 0.0, nan, nan, 0.0, 0.0, 0.0, 0.0, ...",2


In [29]:
# most frequent numbers of clusters across the ensemble
ensemble["n_clusters"].value_counts().head()

2    255
5      9
3      8
6      6
9      5
Name: n_clusters, dtype: int64

In [30]:
# summary statistics of the number of clusters; reused by the tests below
ensemble_stats = ensemble["n_clusters"].describe()
display(ensemble_stats)

count    299.000000
mean       2.612040
std        1.870826
min        2.000000
25%        2.000000
50%        2.000000
75%        2.000000
max       18.000000
Name: n_clusters, dtype: float64

In [31]:
# Deliberately halt the notebook here when no partition was kept: the
# remaining checks and the save step need at least one ensemble member.
assert (
    ensemble.shape[0] > 0
), "Ensemble is empty, stopping here (this is not actually an error if running null simulations)"

### Testing

In [32]:
# every partition in the ensemble has at least two clusters
assert ensemble_stats["min"] > 1

In [33]:
# the number of clusters is defined for every ensemble member
assert ensemble["n_clusters"].notna().all()

In [34]:
# Disabled check: in the saved run the ensemble has 299 rows while 4092
# clusterers were generated, so a strict equality would fail.
# NOTE(review): presumably generate_ensemble drops some partitions — confirm.
# assert ensemble.shape[0] == len(CLUSTERERS)

In [35]:
# all partitions have the right size (one label per row of the input data)
_n_objects = data.shape[0]
assert all(part.shape[0] == _n_objects for part in ensemble["partition"])

In [36]:
# no partition has negative clusters (noisy points)
assert not any((part < 0).any() for part in ensemble["partition"])

### Save

In [37]:
# Output file for the ensemble. An empty options dict is passed on purpose
# (instead of CLUSTERING_OPTIONS), so the file name is derived from the
# clustering method name and the suffix only.
output_filename = (
    Path(RESULTS_DIR)
    / generate_result_set_name(
        {},
        prefix=f"{clustering_method_name}",
        suffix=".pkl",
    )
).resolve()
display(output_filename)

PosixPath('/opt/data/results/clustering/null_sims/shuffle_genes/runs/umap-z_score_std-projection-smultixcan-efo_partial-mashr-zscores/DBSCAN.pkl')

In [38]:
# persist the ensemble as a pickle file at the output path computed above
ensemble.to_pickle(output_filename)