# Description

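Runs hierarchical (agglomerative) clustering with several linkage methods over a range of `k` values on the `z_score_std` version of the data, and saves the resulting ensemble of partitions.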

# Environment variables

In [None]:
from IPython.display import display

import conf

# Value read from the project configuration; the next cell uses it to set the
# thread-count environment variables of the numerical libraries.
N_JOBS = conf.GENERAL["N_JOBS"]
display(N_JOBS)

In [None]:
%env MKL_NUM_THREADS=$N_JOBS
%env OPEN_BLAS_NUM_THREADS=$N_JOBS
%env NUMEXPR_NUM_THREADS=$N_JOBS
%env OMP_NUM_THREADS=$N_JOBS

# Modules loading

In [None]:
# Automatically reload imported modules before executing each cell, so edits to
# project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

from utils import generate_result_set_name

# Settings

In [None]:
# Fix the seed of NumPy's global random generator so repeated runs are reproducible.
np.random.seed(0)

## Input data

In [None]:
# Name of the data transformation to cluster; used below as the input
# sub-directory and as the prefix of the input/output names.
INPUT_SUBSET = "z_score_std"

In [None]:
# Base name of the input file; it is prefixed with INPUT_SUBSET to form the
# pickle file name.
INPUT_STEM = "projection-smultixcan-efo_partial-mashr-zscores"

In [None]:
# Location of the pickled input data:
#   <CLUSTERING_NULL_DIR>/data_transformations/<INPUT_SUBSET>/<INPUT_SUBSET>-<INPUT_STEM>.pkl
input_filepath = (
    Path(conf.RESULTS["CLUSTERING_NULL_DIR"])
    / "data_transformations"
    / INPUT_SUBSET
    / f"{INPUT_SUBSET}-{INPUT_STEM}.pkl"
).resolve()
display(input_filepath)

# Stop right away if the expected input is missing.
assert input_filepath.exists(), "Input file does not exist"

# File name without its ".pkl" extension.
input_filepath_stem = input_filepath.stem
display(input_filepath_stem)

## Clustering

In [None]:
from sklearn.cluster import AgglomerativeClustering

In [None]:
# Passed to generate_ensemble through its `attributes` argument; "n_clusters"
# later shows up as a column of the resulting ensemble.
CLUSTERING_ATTRIBUTES_TO_SAVE = ["n_clusters"]

In [None]:
CLUSTERING_OPTIONS = {}

# Range of the number of clusters (k) to try; both ends are inclusive.
CLUSTERING_OPTIONS["K_MIN"] = 2
CLUSTERING_OPTIONS["K_MAX"] = 75  # sqrt(3749) + some more to get closer to 295
# Ordered list rather than a set: the iteration order of a set of strings can
# change between interpreter runs, which would make the "#<idx>" label given to
# each clusterer differ from run to run.
CLUSTERING_OPTIONS["LINKAGE"] = ["ward", "complete", "average", "single"]
# Metric used to compute the pairwise distance matrix.
CLUSTERING_OPTIONS["AFFINITY"] = "euclidean"

display(CLUSTERING_OPTIONS)

In [None]:
import inspect

# scikit-learn 1.2 renamed AgglomerativeClustering's `affinity` argument to
# `metric`, and 1.4 removed `affinity`. Use whichever keyword the installed
# version accepts so the notebook runs on both old and new releases.
_METRIC_KWARG = (
    "metric"
    if "metric" in inspect.signature(AgglomerativeClustering).parameters
    else "affinity"
)

# One clusterer per (k, linkage) combination, keyed by "<class name> #<idx>".
CLUSTERERS = {}

idx = 0

for k in range(CLUSTERING_OPTIONS["K_MIN"], CLUSTERING_OPTIONS["K_MAX"] + 1):
    for linkage in CLUSTERING_OPTIONS["LINKAGE"]:
        # Ward linkage only accepts euclidean distances; the other linkage
        # methods are configured to receive a precomputed distance matrix.
        affinity = "euclidean" if linkage == "ward" else "precomputed"

        clus = AgglomerativeClustering(
            n_clusters=k,
            linkage=linkage,
            **{_METRIC_KWARG: affinity},
        )

        method_name = type(clus).__name__
        CLUSTERERS[f"{method_name} #{idx}"] = clus

        idx += 1

In [None]:
# Total number of clusterers (one per combination of k and linkage method).
display(len(CLUSTERERS))

In [None]:
# Show the first two (name, clusterer) entries as a quick sanity check.
_iter = iter(CLUSTERERS.items())
display(next(_iter))
display(next(_iter))

In [None]:
# Name used as the prefix of the output file. It is taken from the class itself
# rather than from `method_name`, a variable leaked by the loop that builds
# CLUSTERERS (and undefined if that loop never iterates).
clustering_method_name = AgglomerativeClustering.__name__
display(clustering_method_name)

## Output directory

In [None]:
# Output directory for this notebook:
#   <CLUSTERING_NULL_DIR>/runs/<INPUT_SUBSET>-<INPUT_STEM>
RESULTS_DIR = (
    Path(conf.RESULTS["CLUSTERING_NULL_DIR"]) / "runs" / f"{INPUT_SUBSET}-{INPUT_STEM}"
).resolve()
# Create it (and any missing parents); do nothing if it already exists.
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

display(RESULTS_DIR)

# Load input file

In [None]:
# NOTE(review): unpickling can execute arbitrary code; only load files that
# were produced by this pipeline.
data = pd.read_pickle(input_filepath)

In [None]:
# Dimensions of the loaded data.
data.shape

In [None]:
# Preview of the first rows.
data.head()

In [None]:
# The input must not contain any missing value.
assert data.notna().all().all()

# Clustering

## Generate ensemble

In [None]:
from sklearn.metrics import pairwise_distances
from clustering.ensembles.utils import generate_ensemble

In [None]:
# Pairwise distances between the rows of `data`, using the configured metric.
data_dist = pairwise_distances(data, metric=CLUSTERING_OPTIONS["AFFINITY"])

In [None]:
# Shape of the distance matrix.
data_dist.shape

In [None]:
# Summary statistics over all entries of the distance matrix, shown as strings.
pd.Series(data_dist.ravel()).describe().map(str)

In [None]:
# Run every clusterer and collect the resulting partitions into `ensemble`.
# NOTE(review): the distance matrix is passed both as the first (data) argument
# and as `affinity_matrix`. The "ward" clusterers are configured with
# affinity="euclidean" (not "precomputed"), so if generate_ensemble fits them on
# the first argument they would treat the rows of the distance matrix as feature
# vectors - confirm whether `data` should be the first argument instead.
ensemble = generate_ensemble(
    data_dist,
    CLUSTERERS,
    attributes=CLUSTERING_ATTRIBUTES_TO_SAVE,
    affinity_matrix=data_dist,
)

In [None]:
# the number should be close to 295 (the number of partitions generated by k-means/spectral clustering)
# (with the options above: 74 values of k x 4 linkage methods = 296 clusterers)
ensemble.shape

In [None]:
# Preview of the first partitions in the ensemble.
ensemble.head()

In [None]:
# Number of partitions for the most frequent values of n_clusters.
ensemble["n_clusters"].value_counts().head()

In [None]:
# Summary statistics of n_clusters across partitions; reused by the checks below.
ensemble_stats = ensemble["n_clusters"].describe()
display(ensemble_stats)

### Testing

In [None]:
# every partition has at least two clusters
assert ensemble_stats["min"] > 1

In [None]:
# n_clusters was recorded for every partition
assert not ensemble["n_clusters"].isna().any()

In [None]:
# the ensemble has exactly one row per clusterer
assert ensemble.shape[0] == len(CLUSTERERS)

In [None]:
# every partition assigns a label to each row of the input data
expected_size = data.shape[0]
assert all(labels.shape[0] == expected_size for labels in ensemble["partition"])

In [None]:
# no partition contains negative cluster labels (noisy points)
assert not any((labels < 0).any() for labels in ensemble["partition"])

## Save

In [None]:
# The output file name is built from every option except LINKAGE. A filtered
# copy is used instead of deleting the key from CLUSTERING_OPTIONS, so the
# shared options dict stays intact and this cell can be re-run without raising
# a KeyError.
output_name_options = {
    name: value for name, value in CLUSTERING_OPTIONS.items() if name != "LINKAGE"
}

output_filename = Path(
    RESULTS_DIR,
    generate_result_set_name(
        output_name_options,
        prefix=f"{clustering_method_name}-",
        suffix=".pkl",
    ),
).resolve()
display(output_filename)

In [None]:
# Write the ensemble to `output_filename` as a pickle file.
ensemble.to_pickle(output_filename)