# Description

This notebook prepares the data needed to create a clustering tree visualization (using the R package `clustree`).

# Modules loading

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from IPython.display import display
from pathlib import Path

import numpy as np
import pandas as pd

from utils import generate_result_set_name
import conf

# Settings

In [3]:
# Directory with the consensus clustering results; the best partitions are
# read from it and the clustering tree data is written to it.
CONSENSUS_CLUSTERING_DIR = (
    Path(conf.RESULTS["CLUSTERING_DIR"]) / "consensus_clustering"
).resolve()

display(CONSENSUS_CLUSTERING_DIR)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/clustering/consensus_clustering')

# Load data

## PCA

In [4]:
# Data transformation to load; used both as the results subfolder and as the
# file name prefix when the input path is built.
INPUT_SUBSET = "pca"

In [5]:
# Stem of the input file name; it is placed between the subset prefix and the
# options when the input path is built.
INPUT_STEM = "z_score_std-projection-smultixcan-efo_partial-mashr-zscores"

In [6]:
# Options encoded (by generate_result_set_name) into the name of the PCA
# results file to load.
DR_OPTIONS = dict(
    n_components=50,
    svd_solver="full",
    random_state=0,
)

In [7]:
# Locate the pickled PCA data: <DATA_TRANSFORMATIONS_DIR>/<subset>/<file name>,
# where the file name is derived from INPUT_SUBSET, INPUT_STEM and DR_OPTIONS.
_input_filename = generate_result_set_name(
    DR_OPTIONS,
    prefix=f"{INPUT_SUBSET}-{INPUT_STEM}-",
    suffix=".pkl",
)
input_filepath = (
    Path(conf.RESULTS["DATA_TRANSFORMATIONS_DIR"]) / INPUT_SUBSET / _input_filename
).resolve()
display(input_filepath)

# Fail early if the file is missing.
assert input_filepath.exists(), "Input file does not exist"

# File name without its final extension.
input_filepath_stem = input_filepath.stem
display(input_filepath_stem)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/data_transformations/pca/pca-z_score_std-projection-smultixcan-efo_partial-mashr-zscores-n_components_50-random_state_0-svd_solver_full.pkl')

'pca-z_score_std-projection-smultixcan-efo_partial-mashr-zscores-n_components_50-random_state_0-svd_solver_full'

In [8]:
# Load the PCA data and keep only its first five columns (PCA1..PCA5 in the
# preview below); the remaining components are not used in this notebook.
data_pca = pd.read_pickle(input_filepath).iloc[:, :5]

In [9]:
# Dimensions of the PCA subset as (rows, columns).
data_pca.shape

(3752, 5)

In [10]:
# Preview the first rows of the PCA subset.
data_pca.head()

Unnamed: 0,PCA1,PCA2,PCA3,PCA4,PCA5
100001_raw-Food_weight,0.805216,-0.86539,0.69948,-0.065976,0.999617
100002_raw-Energy,0.588507,-1.491772,1.75634,-3.593295,2.100607
100003_raw-Protein,1.91016,-1.873687,1.876677,-3.832557,1.240704
100004_raw-Fat,0.750799,-0.294733,1.31771,-1.346081,2.006403
100005_raw-Carbohydrate,-0.530044,-0.007398,0.611418,-3.604094,2.227872


## UMAP

In [11]:
# Data transformation to load; used both as the results subfolder and as the
# file name prefix when the input path is built.
INPUT_SUBSET = "umap"

In [12]:
# Stem of the input file name; it is placed between the subset prefix and the
# options when the input path is built.
INPUT_STEM = "z_score_std-projection-smultixcan-efo_partial-mashr-zscores"

In [13]:
# Options encoded (by generate_result_set_name) into the name of the UMAP
# results file to load.
DR_OPTIONS = dict(
    n_components=5,
    metric="euclidean",
    n_neighbors=15,
    random_state=0,
)

In [14]:
# Locate the pickled UMAP data: <DATA_TRANSFORMATIONS_DIR>/<subset>/<file name>,
# where the file name is derived from INPUT_SUBSET, INPUT_STEM and DR_OPTIONS.
_input_filename = generate_result_set_name(
    DR_OPTIONS,
    prefix=f"{INPUT_SUBSET}-{INPUT_STEM}-",
    suffix=".pkl",
)
input_filepath = (
    Path(conf.RESULTS["DATA_TRANSFORMATIONS_DIR"]) / INPUT_SUBSET / _input_filename
).resolve()
display(input_filepath)

# Fail early if the file is missing.
assert input_filepath.exists(), "Input file does not exist"

# File name without its final extension.
input_filepath_stem = input_filepath.stem
display(input_filepath_stem)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/data_transformations/umap/umap-z_score_std-projection-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_5-n_neighbors_15-random_state_0.pkl')

'umap-z_score_std-projection-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_5-n_neighbors_15-random_state_0'

In [15]:
# Load the UMAP data stored at input_filepath (all columns are kept).
data_umap = pd.read_pickle(input_filepath)

In [16]:
# Dimensions of the UMAP data as (rows, columns).
data_umap.shape

(3752, 5)

In [17]:
# Preview the first rows of the UMAP data.
data_umap.head()

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
100001_raw-Food_weight,0.426554,0.670532,7.363805,1.171837,6.297295
100002_raw-Energy,-1.605179,0.815699,8.288521,0.990394,6.817351
100003_raw-Protein,-1.656178,0.788297,8.355906,1.017072,6.845651
100004_raw-Fat,-1.508325,0.802536,8.328274,1.033939,6.709319
100005_raw-Carbohydrate,-1.617872,0.812711,8.307973,1.020575,6.825944


# Load selected best partitions

In [18]:
# Pickle with the best partitions, stored in the consensus clustering folder.
input_file = (CONSENSUS_CLUSTERING_DIR / "best_partitions_by_k.pkl").resolve()
display(input_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/clustering/consensus_clustering/best_partitions_by_k.pkl')

In [19]:
# Load the table of best partitions (indexed by k, see the preview below).
best_partitions = pd.read_pickle(input_file)

In [20]:
# Dimensions of the partitions table as (rows, columns).
best_partitions.shape

(59, 4)

In [21]:
# Preview the first rows of the partitions table.
best_partitions.head()

Unnamed: 0_level_0,method,partition,ari_mean,selected
k,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
8,scc_020,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.224684,True
10,scc_025,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.224565,True
7,scc_020,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.221514,True
6,scc_020,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.2215,True
12,scc_020,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.220312,True


# Prepare data for clustree

In [22]:
# Put the PCA and UMAP columns side by side; the inner join keeps only the
# rows (traits) whose index label is present in both tables.
clustrees_df = pd.concat([data_pca, data_umap], axis="columns", join="inner")

In [23]:
display(clustrees_df.shape)
# The join must not drop any row, and must keep all columns of both inputs.
assert clustrees_df.shape[0] == data_pca.shape[0]
assert clustrees_df.shape[1] == data_pca.shape[1] + data_umap.shape[1]

(3752, 10)

In [24]:
# Preview the combined PCA + UMAP table.
clustrees_df.head()

Unnamed: 0,PCA1,PCA2,PCA3,PCA4,PCA5,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
100001_raw-Food_weight,0.805216,-0.86539,0.69948,-0.065976,0.999617,0.426554,0.670532,7.363805,1.171837,6.297295
100002_raw-Energy,0.588507,-1.491772,1.75634,-3.593295,2.100607,-1.605179,0.815699,8.288521,0.990394,6.817351
100003_raw-Protein,1.91016,-1.873687,1.876677,-3.832557,1.240704,-1.656178,0.788297,8.355906,1.017072,6.845651
100004_raw-Fat,0.750799,-0.294733,1.31771,-1.346081,2.006403,-1.508325,0.802536,8.328274,1.033939,6.709319
100005_raw-Carbohydrate,-0.530044,-0.007398,0.611418,-3.604094,2.227872,-1.617872,0.812711,8.307973,1.020575,6.825944


## Add partitions

In [25]:
# Collect the distinct values found in the shapes of all partitions: there
# must be a single one, equal to the number of rows of the PCA/UMAP data.
_tmp = np.unique([partition.shape for partition in best_partitions["partition"]])
display(_tmp)
assert len(_tmp) == 1
assert _tmp[0] == data_umap.shape[0]
assert data_umap.shape[0] == data_pca.shape[0]

array([3752])

In [26]:
# No value in the partitions table may be missing.
assert best_partitions.notna().all().all()

In [27]:
# Add one "k<k>" column per selected partition, where k is the index label of
# best_partitions. The partition values are presumably plain arrays, so they
# are assigned by position: this assumes they follow the same row order as
# clustrees_df -- TODO confirm against the step that generated them.
clustrees_df = clustrees_df.assign(
    **{
        f"k{k}": partition
        for k, partition, is_selected in zip(
            best_partitions.index,
            best_partitions["partition"],
            best_partitions["selected"],
        )
        if is_selected
    }
)

In [28]:
# Name the index "trait". rename_axis returns a new frame instead of mutating
# the existing Index object in place (as Index.rename(..., inplace=True) does),
# so no other object that might share that Index is affected.
clustrees_df = clustrees_df.rename_axis("trait")

In [29]:
# Dimensions after adding the partition columns, as (rows, columns).
clustrees_df.shape

(3752, 25)

In [30]:
# Preview the table with the partition columns ("k<k>") added.
clustrees_df.head()

Unnamed: 0_level_0,PCA1,PCA2,PCA3,PCA4,PCA5,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5,...,k11,k9,k18,k17,k16,k21,k20,k19,k13,k5
trait,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001_raw-Food_weight,0.805216,-0.86539,0.69948,-0.065976,0.999617,0.426554,0.670532,7.363805,1.171837,6.297295,...,0,0,0,1,1,0,0,0,12,0
100002_raw-Energy,0.588507,-1.491772,1.75634,-3.593295,2.100607,-1.605179,0.815699,8.288521,0.990394,6.817351,...,0,0,0,1,1,18,17,17,12,0
100003_raw-Protein,1.91016,-1.873687,1.876677,-3.832557,1.240704,-1.656178,0.788297,8.355906,1.017072,6.845651,...,0,0,0,1,1,18,17,17,12,0
100004_raw-Fat,0.750799,-0.294733,1.31771,-1.346081,2.006403,-1.508325,0.802536,8.328274,1.033939,6.709319,...,0,0,0,1,1,18,17,17,12,0
100005_raw-Carbohydrate,-0.530044,-0.007398,0.611418,-3.604094,2.227872,-1.617872,0.812711,8.307973,1.020575,6.825944,...,0,0,0,1,1,18,17,17,12,0


In [31]:
# make sure partitions were assigned correctly: there must be at least one
# "k<N>" column, and each of them must have a value for every row
_partition_columns = clustrees_df.columns[
    clustrees_df.columns.str.contains("^k[0-9]+$", regex=True)
]
# without this, an empty selection would make the check below pass vacuously
assert len(_partition_columns) > 0, "No partition columns were found"

# count() gives the number of non-missing values per column; every partition
# column is checked (not only the one with the smallest count)
_labels_per_partition = clustrees_df[_partition_columns].count()
assert (
    _labels_per_partition == data_pca.shape[0]
).all(), "Some partition columns have missing values"

# Save

In [37]:
# Output file, stored in the consensus clustering folder.
output_file = (CONSENSUS_CLUSTERING_DIR / "clustering_tree_data.tsv").resolve()
display(output_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/clustering/consensus_clustering/clustering_tree_data.tsv')

In [38]:
# Write the table as tab-separated values.
clustrees_df.to_csv(output_file, sep="\t")