# Description

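This notebook projects the input data (standardized z-scores) into UMAP representations, one for each configured number of components, and saves each projection to a pickle file.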

# Load modules

In [1]:
# enable the autoreload extension in mode 2: imported modules are reloaded
# before each cell runs, so edits to local modules take effect without
# restarting the kernel
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
from IPython.display import display

import pandas as pd
import umap
import matplotlib.pyplot as plt
import seaborn as sns

import conf
from utils import generate_result_set_name

# Settings

In [3]:
# stem of the input file name; the 'z_score_std-' prefix and the '.pkl'
# extension are added around it when the full input path is built
INPUT_FILEPATH_STEM = 'diseases_only-smultixcan-efo_partial-mashr-zscores'

In [4]:
# input file: <data transformations dir>/z_score_std/z_score_std-<stem>.pkl
INPUT_FILEPATH = (
    Path(conf.RESULTS["DATA_TRANSFORMATIONS_DIR"])
    / 'z_score_std'
    / f'z_score_std-{INPUT_FILEPATH_STEM}.pkl'
).resolve()
display(INPUT_FILEPATH)

# input file name without its extension; used as part of the output file names
input_filepath_stem = INPUT_FILEPATH.stem
display(input_filepath_stem)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/data_transformations/z_score_std/z_score_std-diseases_only-smultixcan-efo_partial-mashr-zscores.pkl')

'z_score_std-diseases_only-smultixcan-efo_partial-mashr-zscores'

In [5]:
# options for the dimensionality reduction (UMAP) step
DR_OPTIONS = {
    # numbers of components to use; one projection is computed for each value
    'n_components': [5, 10, 20, 30, 40, 50],
    # the remaining entries are passed unchanged to umap.UMAP
    'metric': 'euclidean',
    'n_neighbors': 15,
    # fixed seed so the projections are reproducible
    'random_state': 0,
}

In [6]:
# directory where this notebook writes its projections (created if missing)
RESULTS_DIR = (Path(conf.RESULTS["DATA_TRANSFORMATIONS_DIR"]) / 'umap').resolve()
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

display(RESULTS_DIR)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/data_transformations/umap')

In [7]:
# every option/setting of this run; its entries are used to generate the
# output file names. This is a shallow copy of DR_OPTIONS, so the
# 'n_components' list object is shared between both dictionaries.
ALL_OPTIONS = dict(DR_OPTIONS)

display(ALL_OPTIONS)

{'n_components': [5, 10, 20, 30, 40, 50],
 'metric': 'euclidean',
 'n_neighbors': 15,
 'random_state': 0}

# Load input file

In [8]:
# load the input data from its pickle file
# NOTE(review): unpickling can execute arbitrary code; this assumes the file
# was produced by a trusted step of this project - confirm before reusing
# this cell with files from other sources
data = pd.read_pickle(INPUT_FILEPATH)

In [9]:
# dimensions of the loaded data: (rows, columns)
data.shape

(538, 6452)

In [10]:
# preview the first rows of the loaded data
data.head()

gene_name,ENSG00000183087,ENSG00000157227,ENSG00000096696,ENSG00000175130,ENSG00000113140,ENSG00000117984,ENSG00000116016,ENSG00000129116,ENSG00000134686,ENSG00000108679,...,ENSG00000111716,ENSG00000166796,ENSG00000114331,ENSG00000131584,ENSG00000165410,ENSG00000172757,ENSG00000147862,ENSG00000008323,ENSG00000167083,ENSG00000149257
vascular disease AND intestinal disease,0.847454,1.009161,-0.834057,-0.070589,-0.329654,1.339072,-1.317406,-0.153223,0.20808,1.305476,...,-0.862693,0.178475,-0.867851,-0.318883,0.023148,0.121395,0.106954,0.282147,-0.811827,0.413489
"osteoarthritis, knee",-0.949368,2.236648,0.059025,0.312724,-1.30317,0.827997,2.295848,-0.1187,1.478273,-0.048731,...,-0.900876,1.2399,-0.357637,-0.5102,-0.233842,-0.130737,-0.066291,-0.47085,0.133876,0.992046
carpal tunnel syndrome,3.451503,0.179316,1.656231,-0.075898,0.546487,0.182674,0.303961,-0.571854,-0.822994,0.212698,...,1.314737,-0.700739,-0.570497,0.672079,-0.255689,0.473933,-0.319597,2.733735,1.24762,-0.433203
gastritis,-0.301861,0.026726,-1.095334,0.238529,-1.060668,-1.277446,1.096986,-0.490961,0.388828,2.672629,...,1.68659,-0.032626,-0.554427,0.406311,-0.72537,-0.700858,-0.669952,2.088057,-0.195165,1.154248
neoplasm,3.1862,-0.215859,-0.64035,0.748803,-1.183247,-0.773517,0.111114,-0.57906,-0.909444,-0.860679,...,0.849432,0.53319,0.851463,0.588349,-1.023563,0.226923,1.508859,0.368243,0.63663,0.131323


# UMAP

In [11]:
def get_umap_proj(orig_data, options):
    """Fit UMAP on `orig_data` and return the projection as a data frame.

    Only the entries of `options` whose keys also appear in the module-level
    `DR_OPTIONS` dictionary are forwarded to `umap.UMAP`; the selected
    options are displayed before fitting.

    Args:
        orig_data: data to project. It must expose an `index` attribute,
            which is copied into the result.
        options: dictionary with settings; keys not present in `DR_OPTIONS`
            are ignored.

    Returns:
        A `pandas.DataFrame` with the same index as `orig_data` and one
        column per UMAP component, named `UMAP1`, `UMAP2`, and so on.
    """
    # keep only the settings that belong to the dimensionality reduction step
    umap_kwargs = {}
    for name, value in options.items():
        if name in DR_OPTIONS:
            umap_kwargs[name] = value
    display(f'UMAP options: {umap_kwargs}')

    # fit on the data, then project that same data with the fitted model
    reducer = umap.UMAP(**umap_kwargs).fit(orig_data)
    projection = reducer.transform(orig_data)

    n_dims = projection.shape[1]
    column_names = [f'UMAP{dim}' for dim in range(1, n_dims + 1)]

    return pd.DataFrame(
        data=projection,
        index=orig_data.index.copy(),
        columns=column_names,
    )

In [12]:
# compute and save one UMAP projection per requested number of components
for n_comp in DR_OPTIONS['n_components']:
    print(f'# components: {n_comp}')

    # settings of this run: ALL_OPTIONS with a single n_components value
    options = {**ALL_OPTIONS, 'n_components': n_comp}

    dr_data = get_umap_proj(data, options)

    # the projection must keep every row and have exactly n_comp columns
    display(dr_data.shape)
    assert dr_data.shape == (data.shape[0], n_comp)

    # summary statistics of (at most) the first five components
    display(dr_data.iloc[:, 0:5].describe())

    # save
    output_filename = generate_result_set_name(
        options,
        prefix=f'umap-{input_filepath_stem}-',
        suffix='.pkl',
    )
    output_file = Path(RESULTS_DIR, output_filename).resolve()
    display(output_file)

    dr_data.to_pickle(output_file)

    print('\n')

# components: 5


"UMAP options: {'n_components': 5, 'metric': 'euclidean', 'n_neighbors': 15, 'random_state': 0}"

(538, 5)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,538.0,538.0,538.0,538.0,538.0
mean,7.679551,7.936632,3.632716,9.876525,7.316477
std,2.656788,0.655753,0.391828,0.487032,0.620014
min,1.815232,6.598701,2.57869,9.074773,6.057494
25%,4.638641,7.429265,3.398335,9.442492,6.824173
50%,9.144253,7.852081,3.622641,9.788501,7.294547
75%,9.579413,8.496892,3.865244,10.2495,7.769266
max,10.745596,9.309904,4.981396,10.972158,8.810878


PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/data_transformations/umap/umap-z_score_std-diseases_only-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_5-n_neighbors_15-random_state_0.pkl')



# components: 10


"UMAP options: {'n_components': 10, 'metric': 'euclidean', 'n_neighbors': 15, 'random_state': 0}"

(538, 10)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,538.0,538.0,538.0,538.0,538.0
mean,7.127672,8.032342,5.547537,6.817735,5.702674
std,1.757436,0.46789,0.416089,1.963808,0.552181
min,3.377839,7.11008,4.808387,4.820794,4.322804
25%,5.011455,7.559957,5.153482,5.437932,5.177649
50%,8.082147,8.107326,5.594849,5.71063,5.864573
75%,8.42413,8.441905,5.897865,9.083092,6.096544
max,8.964018,8.934828,6.497295,11.146702,6.662196


PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/data_transformations/umap/umap-z_score_std-diseases_only-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_10-n_neighbors_15-random_state_0.pkl')



# components: 20


"UMAP options: {'n_components': 20, 'metric': 'euclidean', 'n_neighbors': 15, 'random_state': 0}"

(538, 20)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,538.0,538.0,538.0,538.0,538.0
mean,8.115372,2.932507,5.10305,8.423316,4.496166
std,1.951931,0.334889,0.568414,0.916021,0.323663
min,3.618614,2.23747,3.896196,7.327308,3.958051
25%,5.854597,2.646748,4.475454,7.751944,4.23586
50%,9.300107,2.972522,5.335828,7.983537,4.407367
75%,9.474627,3.196471,5.536899,9.51064,4.789108
max,9.780196,3.643408,5.982479,10.478279,5.297492


PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/data_transformations/umap/umap-z_score_std-diseases_only-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_20-n_neighbors_15-random_state_0.pkl')



# components: 30


"UMAP options: {'n_components': 30, 'metric': 'euclidean', 'n_neighbors': 15, 'random_state': 0}"

(538, 30)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,538.0,538.0,538.0,538.0,538.0
mean,7.562059,1.884322,3.802493,8.56341,5.143482
std,1.56849,0.237603,0.577048,0.577803,0.628588
min,3.993982,1.308651,2.977509,7.828005,4.385872
25%,5.672941,1.709219,3.366018,8.12505,4.684777
50%,8.486971,1.881908,3.532596,8.281985,4.823236
75%,8.65548,2.06534,4.432286,9.253338,5.83246
max,8.970595,2.46367,5.1208,9.75533,6.648346


PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/data_transformations/umap/umap-z_score_std-diseases_only-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_30-n_neighbors_15-random_state_0.pkl')



# components: 40


"UMAP options: {'n_components': 40, 'metric': 'euclidean', 'n_neighbors': 15, 'random_state': 0}"

(538, 40)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,538.0,538.0,538.0,538.0,538.0
mean,7.266353,2.663847,4.668249,8.397497,5.956469
std,1.625993,0.305332,0.1304,0.360706,0.477562
min,3.646526,1.772859,4.400087,7.721803,4.973301
25%,5.330475,2.439738,4.570608,7.990496,5.413934
50%,8.223419,2.745021,4.656929,8.509551,6.113065
75%,8.404471,2.868153,4.742877,8.679117,6.327758
max,8.891572,3.254819,5.03673,9.013394,6.695738


PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/data_transformations/umap/umap-z_score_std-diseases_only-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_40-n_neighbors_15-random_state_0.pkl')



# components: 50


"UMAP options: {'n_components': 50, 'metric': 'euclidean', 'n_neighbors': 15, 'random_state': 0}"

(538, 50)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,538.0,538.0,538.0,538.0,538.0
mean,7.731828,2.548878,4.49763,9.170915,5.277064
std,2.057031,0.306816,0.293413,0.21239,0.106116
min,3.192575,1.778922,3.808914,8.73273,5.003058
25%,5.244747,2.266116,4.192042,9.010317,5.203112
50%,8.982694,2.625825,4.607757,9.139885,5.261964
75%,9.158877,2.798548,4.726458,9.336854,5.350678
max,9.570532,3.100802,4.948246,9.702306,5.596568


PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/data_transformations/umap/umap-z_score_std-diseases_only-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_50-n_neighbors_15-random_state_0.pkl')



