# Description

This notebook runs DBSCAN on the `umap` version of the data.

The notebook explores different values for `min_samples` and `eps` (the main parameters of DBSCAN).

# Environment variables

In [1]:
from IPython.display import display

import conf

# Degree of parallelism read from the project configuration; it is reused below
# for the numerical-backend thread environment variables and for DBSCAN's `n_jobs`.
N_JOBS = conf.GENERAL["N_JOBS"]
display(N_JOBS)

10

In [2]:
%env MKL_NUM_THREADS=$N_JOBS
%env OPEN_BLAS_NUM_THREADS=$N_JOBS
%env NUMEXPR_NUM_THREADS=$N_JOBS
%env OMP_NUM_THREADS=$N_JOBS

env: MKL_NUM_THREADS=10
env: OPEN_BLAS_NUM_THREADS=10
env: NUMEXPR_NUM_THREADS=10
env: OMP_NUM_THREADS=10


# Modules loading

In [3]:
# Reload edited project modules automatically before each cell execution
# (autoreload mode 2 = all modules), so changes in local packages are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise_distances
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt

from utils import generate_result_set_name
from clustering.ensembles.utils import generate_ensemble

# Global settings

In [5]:
# Fix NumPy's global random state so any NumPy-based randomness in this run is repeatable.
np.random.seed(0)

In [6]:
# Passed as `attributes=` to generate_ensemble below; the resulting ensemble
# has a column with this name (`n_clusters`).
CLUSTERING_ATTRIBUTES_TO_SAVE = ["n_clusters"]

In [7]:
# Base directory of the `shuffle_lvs` null simulation; both the input file and
# the output directory of this notebook are resolved relative to it.
NULL_DIR = conf.RESULTS["CLUSTERING_NULL_DIR"] / "shuffle_lvs"

# Settings

In [8]:
# these parameter values are taken from the pre-analysis notebook for this clustering method and data version
k_values = np.arange(2, 125 + 1, 1)

# Each row is (exclusive upper bound for k, (eps_min, eps_max)). Rows are
# checked in order and the first one whose bound is greater than k wins; the
# last row has no upper bound and therefore catches every remaining k.
_EPS_RANGE_BY_K_UPPER_BOUND = (
    (5, (0.23, 0.80)),
    (10, (0.30, 0.80)),
    (20, (0.40, 2.50)),
    (50, (0.40, 7.00)),
    (float("inf"), (0.50, 10.00)),
)

# min_samples (k) -> (eps_min, eps_max) interval to explore for that k
eps_range_per_k = {
    k: next(
        eps_bounds
        for k_upper, eps_bounds in _EPS_RANGE_BY_K_UPPER_BOUND
        if k < k_upper
    )
    for k in k_values
}

# Data version: umap

## Settings

In [9]:
# Data version to cluster; used as a subdirectory name and as a file/directory name prefix below.
INPUT_SUBSET = "umap"

In [10]:
# Base name shared by the input file and the output directory (combined with INPUT_SUBSET below).
INPUT_STEM = "z_score_std-projection-smultixcan-efo_partial-mashr-zscores"

In [11]:
# parameters of the dimensionality reduction steps; they are encoded into the
# input file name by generate_result_set_name below, so they must match the
# values used when the projection was generated
DR_OPTIONS = {
    "n_components": 50,
    "metric": "euclidean",
    "n_neighbors": 15,
    "random_state": 0,
}

In [12]:
# Build the absolute path of the pickled projection to cluster. The file name
# is derived from the data version, the stem and the options in DR_OPTIONS.
input_filepath = (
    Path(NULL_DIR)
    .joinpath(
        "data_transformations",
        INPUT_SUBSET,
        generate_result_set_name(
            DR_OPTIONS,
            prefix=f"{INPUT_SUBSET}-{INPUT_STEM}-",
            suffix=".pkl",
        ),
    )
    .resolve()
)
display(input_filepath)

# fail right away, before any computation, if the expected input is missing
assert input_filepath.exists(), "Input file does not exist"

# file name without the ".pkl" extension
input_filepath_stem = input_filepath.stem
display(input_filepath_stem)

PosixPath('/opt/data/results/clustering/null_sims/shuffle_lvs/data_transformations/umap/umap-z_score_std-projection-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_50-n_neighbors_15-random_state_0.pkl')

'umap-z_score_std-projection-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_50-n_neighbors_15-random_state_0'

In [13]:
# Output directory for this notebook: <NULL_DIR>/runs/<subset>-<stem>.
# It is created (including missing parents) if it does not exist yet.
RESULTS_DIR = Path(NULL_DIR).joinpath("runs", f"{INPUT_SUBSET}-{INPUT_STEM}").resolve()
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

display(RESULTS_DIR)

PosixPath('/opt/data/results/clustering/null_sims/shuffle_lvs/runs/umap-z_score_std-projection-smultixcan-efo_partial-mashr-zscores')

## Load input file

In [14]:
# NOTE: unpickling can execute arbitrary code; only load files produced by this
# project's own pipeline.
data = pd.read_pickle(input_filepath)

In [15]:
# (rows, columns); the number of columns is expected to match DR_OPTIONS["n_components"]
data.shape

(3752, 50)

In [16]:
# quick visual check of the first rows of the loaded projection
data.head()

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5,UMAP6,UMAP7,UMAP8,UMAP9,UMAP10,...,UMAP41,UMAP42,UMAP43,UMAP44,UMAP45,UMAP46,UMAP47,UMAP48,UMAP49,UMAP50
100001_raw-Food_weight,7.348012,4.577476,4.003544,7.175258,4.628082,4.668375,4.184439,5.524602,5.236019,4.05196,...,7.420596,8.645991,8.978471,0.844213,0.445817,8.147386,0.880679,9.326646,5.905894,9.498857
100002_raw-Energy,7.310478,4.624479,4.003817,7.205625,4.674837,4.623739,4.303133,5.513216,5.319317,3.911008,...,7.434267,8.683359,8.898481,0.940067,0.438958,8.215672,0.835403,9.336117,5.906696,9.496916
100003_raw-Protein,7.37212,4.567552,3.956281,7.172141,4.637493,4.662377,4.164535,5.569874,5.245649,3.992713,...,7.414976,8.643587,8.971425,0.86641,0.448116,8.160679,0.888818,9.330538,5.882386,9.454537
100004_raw-Fat,7.246791,4.422901,3.999165,6.995014,4.689552,4.778102,4.28379,5.544029,5.27334,4.024833,...,7.451052,8.683681,8.907457,0.842343,0.455955,8.174278,0.874224,9.339664,5.891949,9.500463
100005_raw-Carbohydrate,7.340907,4.559629,4.066139,7.116928,4.623105,4.678367,4.169051,5.549784,5.156203,4.129416,...,7.46093,8.663488,8.955309,0.796839,0.432522,8.127192,0.87322,9.321359,5.902341,9.528457


In [17]:
# the input must be complete: no missing value in any cell
assert not data.isna().to_numpy().any()

## Clustering

### Generate clusterers

In [18]:
CLUSTERING_OPTIONS = {
    # K_RANGE is the min_samples parameter in DBSCAN (sklearn)
    "K_RANGE": k_values,
    # (eps_min, eps_max) interval explored for each value of k
    "EPS_RANGE_PER_K": eps_range_per_k,
    # despite the name, this is the NUMBER of evenly spaced eps values generated
    # per k (it is passed as the count to np.linspace), not a step size
    "EPS_STEP": 33,
    # metric used to precompute the distance matrix given to DBSCAN
    "METRIC": "euclidean",
}

display(CLUSTERING_OPTIONS)

{'K_RANGE': array([  2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,
         15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,
         28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,
         41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,
         54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,
         67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
         80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,
         93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104, 105,
        106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118,
        119, 120, 121, 122, 123, 124, 125]),
 'EPS_RANGE_PER_K': {2: (0.23, 0.8),
  3: (0.23, 0.8),
  4: (0.23, 0.8),
  5: (0.3, 0.8),
  6: (0.3, 0.8),
  7: (0.3, 0.8),
  8: (0.3, 0.8),
  9: (0.3, 0.8),
  10: (0.4, 2.5),
  11: (0.4, 2.5),
  12: (0.4, 2.5),
  13: (0.4, 2.5),
  14: (0.4, 2.5),
  15: (0.4, 2.5),
  16: (0.4, 2.5),
  1

In [19]:
# Build one DBSCAN instance per (min_samples, eps) combination, keyed by a
# unique, sequentially numbered id ("DBSCAN #0", "DBSCAN #1", ...).
CLUSTERERS = {}

# All clusterers are instances of the same class, so the method name is fixed
# once up front. A later cell reads `method_name`; defining it here (instead of
# inside the inner loop) keeps it available even if the parameter ranges are empty.
method_name = DBSCAN.__name__

idx = 0

for k in CLUSTERING_OPTIONS["K_RANGE"]:
    eps_range = CLUSTERING_OPTIONS["EPS_RANGE_PER_K"][k]
    # EPS_STEP is the number of evenly spaced eps values (both endpoints
    # included), not the distance between consecutive values
    eps_values = np.linspace(eps_range[0], eps_range[1], CLUSTERING_OPTIONS["EPS_STEP"])

    for eps in eps_values:
        # metric="precomputed": the clusterer is fitted on a distance matrix,
        # not on the raw data
        clus = DBSCAN(min_samples=k, eps=eps, metric="precomputed", n_jobs=N_JOBS)

        CLUSTERERS[f"{method_name} #{idx}"] = clus

        idx += 1

In [20]:
# expected: len(K_RANGE) * EPS_STEP clusterers (one per k/eps combination)
display(len(CLUSTERERS))

4092

In [21]:
# show the first two (id, clusterer) pairs to verify naming and parameters
_iter = iter(CLUSTERERS.items())
display(next(_iter))
display(next(_iter))

('DBSCAN #0', DBSCAN(eps=0.23, metric='precomputed', min_samples=2, n_jobs=10))

('DBSCAN #1',
 DBSCAN(eps=0.24781250000000002, metric='precomputed', min_samples=2, n_jobs=10))

In [22]:
# `method_name` is the variable set in the clusterer-generation cell above;
# it is used below as the prefix of the output file name
clustering_method_name = method_name
display(clustering_method_name)

'DBSCAN'

### Generate ensemble

In [23]:
# Compute the full pairwise distance matrix once; every DBSCAN instance was
# created with metric="precomputed" and is run on this matrix instead of on `data`.
data_dist = pairwise_distances(data, metric=CLUSTERING_OPTIONS["METRIC"])

In [24]:
# square matrix: one row and one column per row of `data`
data_dist.shape

(3752, 3752)

In [25]:
# Summary statistics over ALL entries of the distance matrix. This includes the
# zero self-distances on the diagonal (hence min is 0) and counts every pair twice.
# Values are converted to strings for display (presumably to avoid float formatting).
pd.Series(data_dist.flatten()).describe().apply(str)

count            14077504.0
mean      3.277536392211914
std      1.2617443799972534
min                     0.0
25%        2.33499538898468
50%       3.572208523750305
75%       4.346373558044434
max       5.757095813751221
dtype: object

In [26]:
# Run all clusterers on the precomputed distance matrix and collect the results
# in a data frame with one row per clusterer id.
# NOTE(review): generate_ensemble is project code not shown here; judging by the
# output below it can return fewer rows than there are clusterers — confirm
# which partitions it discards.
ensemble = generate_ensemble(
    data_dist,
    CLUSTERERS,
    attributes=CLUSTERING_ATTRIBUTES_TO_SAVE,
)

100%|██████████| 4092/4092 [06:59<00:00,  9.75it/s]


In [27]:
# number of partitions kept in the ensemble (can be smaller than len(CLUSTERERS))
ensemble.shape

(287, 3)

In [28]:
# each row holds the clusterer parameters, the partition and the saved attributes
ensemble.head()

Unnamed: 0_level_0,clusterer_params,partition,n_clusters
clusterer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
DBSCAN #0,"{'algorithm': 'auto', 'eps': 0.23, 'leaf_size'...","[0.0, nan, nan, nan, 0.0, nan, nan, nan, nan, ...",16
DBSCAN #1,"{'algorithm': 'auto', 'eps': 0.247812500000000...","[0.0, nan, nan, nan, 0.0, nan, nan, nan, nan, ...",14
DBSCAN #2,"{'algorithm': 'auto', 'eps': 0.265625, 'leaf_s...","[0.0, nan, nan, nan, 0.0, nan, nan, 0.0, nan, ...",9
DBSCAN #3,"{'algorithm': 'auto', 'eps': 0.2834375, 'leaf_...","[0.0, nan, 0.0, nan, 0.0, nan, nan, 0.0, nan, ...",6
DBSCAN #4,"{'algorithm': 'auto', 'eps': 0.30125, 'leaf_si...","[0.0, nan, 0.0, 0.0, 0.0, nan, nan, 0.0, nan, ...",2


In [29]:
# most frequent numbers of clusters across the partitions in the ensemble
ensemble["n_clusters"].value_counts().head()

2    237
5      9
8      7
6      7
7      6
Name: n_clusters, dtype: int64

In [30]:
# distribution of the number of clusters per partition; `ensemble_stats` is
# reused by the checks in the Testing section
ensemble_stats = ensemble["n_clusters"].describe()
display(ensemble_stats)

count    287.000000
mean       2.790941
std        2.078408
min        2.000000
25%        2.000000
50%        2.000000
75%        2.000000
max       16.000000
Name: n_clusters, dtype: float64

In [31]:
# Stop the notebook here if no partition was kept. As the message says, this
# outcome is expected for some null simulations and is not necessarily a bug.
assert (
    ensemble.shape[0] > 0
), "Ensemble is empty, stopping here (this is not actually an error if running null simulations)"

### Testing

In [32]:
# `min` comes from describe() on n_clusters: every kept partition must have more than one cluster
assert ensemble_stats["min"] > 1

In [33]:
# the number of clusters must be defined for every partition
assert not ensemble["n_clusters"].isna().any()

In [34]:
# Intentionally disabled: the ensemble can contain fewer rows than the number of
# clusterers generated (compare ensemble.shape with len(CLUSTERERS) above), so
# this equality does not hold for this run.
# assert ensemble.shape[0] == len(CLUSTERERS)

In [35]:
# every partition must assign a label (or NaN) to each row of the input data
assert all(
    partition.shape[0] == data.shape[0] for partition in ensemble["partition"]
)

In [36]:
# negative labels (noisy points) must not appear in any stored partition
assert not any((partition < 0).any() for partition in ensemble["partition"])

### Save

In [37]:
# The output file is named after the clustering method only: an empty options
# dict is passed (instead of CLUSTERING_OPTIONS), so no parameter values are
# encoded in the file name.
output_filename = (
    Path(RESULTS_DIR)
    .joinpath(
        generate_result_set_name(
            {},
            prefix=f"{clustering_method_name}",
            suffix=".pkl",
        )
    )
    .resolve()
)
display(output_filename)

PosixPath('/opt/data/results/clustering/null_sims/shuffle_lvs/runs/umap-z_score_std-projection-smultixcan-efo_partial-mashr-zscores/DBSCAN.pkl')

In [38]:
# persist the ensemble; an existing file at this path is overwritten
ensemble.to_pickle(output_filename)