# Description

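This notebook projects the input data into UMAP representations, one for each number of components listed in the settings, and saves each projection to disk.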

# Modules loading

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
from IPython.display import display

import pandas as pd
import umap
import matplotlib.pyplot as plt
import seaborn as sns

import conf
from utils import generate_result_set_name

# Settings

In [3]:
# stem (file name without prefix/extension) of the input data file; it is
# interpolated into the full input file name when INPUT_FILEPATH is built
INPUT_FILEPATH_STEM = 'diseases_only-smultixcan-efo_partial-mashr-zscores'

In [4]:
# absolute path of the input pickle, located in the 'z_score_std' folder of
# the data transformations results directory
INPUT_FILEPATH = (
    Path(conf.RESULTS["DATA_TRANSFORMATIONS_DIR"])
    / 'z_score_std'
    / f'z_score_std-{INPUT_FILEPATH_STEM}.pkl'
).resolve()

# file name without extension; reused later as part of the output file names
input_filepath_stem = INPUT_FILEPATH.stem

display(INPUT_FILEPATH)
display(input_filepath_stem)

PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/z_score_std/z_score_std-diseases_only-smultixcan-efo_partial-mashr-zscores.pkl')

'z_score_std-diseases_only-smultixcan-efo_partial-mashr-zscores'

In [5]:
# options for the dimensionality reduction step (UMAP); 'n_components' holds
# the list of embedding sizes to try, the remaining entries are used as given
DR_OPTIONS = dict(
    n_components=[5, 10, 20, 30, 40, 50],
    metric='euclidean',
    n_neighbors=15,
    random_state=0,
)

In [6]:
# directory where this notebook writes its projections; created (together
# with any missing parents) if it does not exist yet
RESULTS_DIR = (Path(conf.RESULTS["DATA_TRANSFORMATIONS_DIR"]) / 'umap').resolve()
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

display(RESULTS_DIR)

PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap')

In [7]:
# all options/settings of this run (used to generate output file names);
# currently a shallow copy of the dimensionality reduction options
ALL_OPTIONS = dict(DR_OPTIONS)

display(ALL_OPTIONS)

{'n_components': [5, 10, 20, 30, 40, 50],
 'metric': 'euclidean',
 'n_neighbors': 15,
 'random_state': 0}

# Load input file

In [8]:
# load the input data from the pickle file at INPUT_FILEPATH
# NOTE: unpickling can execute arbitrary code; only load files that come from
# a trusted source (here, a file under this project's results directory)
data = pd.read_pickle(INPUT_FILEPATH)

In [9]:
# dimensions of the loaded data (rows, columns)
data.shape

(538, 6452)

In [10]:
# preview of the first rows to check the data was read as expected
data.head()

gene_name,ENSG00000183087,ENSG00000157227,ENSG00000096696,ENSG00000175130,ENSG00000113140,ENSG00000117984,ENSG00000116016,ENSG00000129116,ENSG00000134686,ENSG00000108679,...,ENSG00000111716,ENSG00000166796,ENSG00000114331,ENSG00000131584,ENSG00000165410,ENSG00000172757,ENSG00000147862,ENSG00000008323,ENSG00000167083,ENSG00000149257
vascular disease AND intestinal disease,0.847454,1.009161,-0.834057,-0.070589,-0.329654,1.339072,-1.317406,-0.153223,0.20808,1.305476,...,-0.862693,0.178475,-0.867851,-0.318883,0.023148,0.121395,0.106954,0.282147,-0.811827,0.413489
"osteoarthritis, knee",-0.949368,2.236648,0.059025,0.312724,-1.30317,0.827997,2.295848,-0.1187,1.478273,-0.048731,...,-0.900876,1.2399,-0.357637,-0.5102,-0.233842,-0.130737,-0.066291,-0.47085,0.133876,0.992046
carpal tunnel syndrome,3.451503,0.179316,1.656231,-0.075898,0.546487,0.182674,0.303961,-0.571854,-0.822994,0.212698,...,1.314737,-0.700739,-0.570497,0.672079,-0.255689,0.473933,-0.319597,2.733735,1.24762,-0.433203
gastritis,-0.301861,0.026726,-1.095334,0.238529,-1.060668,-1.277446,1.096986,-0.490961,0.388828,2.672629,...,1.68659,-0.032626,-0.554427,0.406311,-0.72537,-0.700858,-0.669952,2.088057,-0.195165,1.154248
neoplasm,3.1862,-0.215859,-0.64035,0.748803,-1.183247,-0.773517,0.111114,-0.57906,-0.909444,-0.860679,...,0.849432,0.53319,0.851463,0.588349,-1.023563,0.226923,1.508859,0.368243,0.63663,0.131323


# UMAP

In [11]:
def get_umap_proj(orig_data, options):
    """Fit UMAP on `orig_data` and return the projection of that same data.

    Parameters
    ----------
    orig_data
        Data to embed. It must expose an ``index`` attribute, which is copied
        into the result (this notebook passes a pandas DataFrame).
    options : dict
        Settings for this run. Only the entries whose key is also present in
        the notebook-level ``DR_OPTIONS`` are forwarded to ``umap.UMAP``.

    Returns
    -------
    pandas.DataFrame
        The embedding, indexed like `orig_data`, with one column per
        embedding dimension named ``UMAP1``, ``UMAP2``, ...
    """
    umap_kwargs = {
        name: value for name, value in options.items() if name in DR_OPTIONS
    }
    display(f'UMAP options: {umap_kwargs}')

    # fit the model first, then project the very same data with it
    reducer = umap.UMAP(**umap_kwargs).fit(orig_data)
    embedding = reducer.transform(orig_data)

    n_dims = embedding.shape[1]
    component_names = [f'UMAP{dim}' for dim in range(1, n_dims + 1)]

    return pd.DataFrame(
        data=embedding,
        index=orig_data.index.copy(),
        columns=component_names,
    )

In [12]:
# run UMAP once per requested number of components and save each projection
for n_comp in DR_OPTIONS['n_components']:
    print(f'# components: {n_comp}')

    # settings of this run: all options, with a single value for n_components
    options = {**ALL_OPTIONS, 'n_components': n_comp}

    dr_data = get_umap_proj(data, options)

    # the projection must keep every input row and have n_comp columns
    display(dr_data.shape)
    assert dr_data.shape == (data.shape[0], n_comp)

    # summary statistics of (at most) the first five components
    display(dr_data.iloc[:, 0:5].describe())

    # save; the file name is generated from the settings of this run
    output_file = (
        RESULTS_DIR
        / generate_result_set_name(
            options,
            prefix=f'umap-{input_filepath_stem}-',
            suffix='.pkl',
        )
    ).resolve()
    display(output_file)

    dr_data.to_pickle(output_file)

    print('\n')

# components: 5


"UMAP options: {'n_components': 5, 'metric': 'euclidean', 'n_neighbors': 15, 'random_state': 0}"

(538, 5)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,538.0,538.0,538.0,538.0,538.0
mean,9.916106,6.716135,4.584635,8.25541,5.665169
std,2.427202,1.323215,0.595043,0.551621,1.298313
min,4.604455,4.508282,3.274413,7.229296,3.943371
25%,7.029998,5.744145,4.100205,7.738104,4.694282
50%,11.228889,6.157133,4.610241,8.288859,5.102781
75%,11.679579,8.278051,5.017382,8.714386,7.181852
max,12.785789,9.396283,6.110092,9.348644,8.177178


PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap/umap-z_score_std-diseases_only-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_5-n_neighbors_15-random_state_0.pkl')



# components: 10


"UMAP options: {'n_components': 10, 'metric': 'euclidean', 'n_neighbors': 15, 'random_state': 0}"

(538, 10)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,538.0,538.0,538.0,538.0,538.0
mean,8.687437,6.587051,5.339416,7.245595,5.433805
std,2.269826,1.008781,0.418648,0.645754,0.274165
min,3.832593,5.071779,4.540671,6.146819,4.695827
25%,5.989369,5.85933,4.939342,6.754952,5.267026
50%,9.994508,6.175226,5.392513,7.042087,5.445654
75%,10.273606,7.599333,5.677885,7.777954,5.606027
max,10.886015,9.067617,6.194843,9.115561,6.259279


PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap/umap-z_score_std-diseases_only-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_10-n_neighbors_15-random_state_0.pkl')



# components: 20


"UMAP options: {'n_components': 20, 'metric': 'euclidean', 'n_neighbors': 15, 'random_state': 0}"

(538, 20)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,538.0,538.0,538.0,538.0,538.0
mean,7.432445,2.752669,4.304295,8.77245,4.72354
std,1.966097,0.550718,0.379191,0.432511,0.381021
min,2.995693,1.649845,3.723641,7.980368,3.788003
25%,5.060245,2.114124,4.002829,8.44728,4.36628
50%,8.599894,2.994018,4.180148,8.624469,4.840492
75%,8.838341,3.176611,4.701052,9.2534,4.995247
max,9.128508,3.570746,5.110425,9.615664,5.348433


PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap/umap-z_score_std-diseases_only-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_20-n_neighbors_15-random_state_0.pkl')



# components: 30


"UMAP options: {'n_components': 30, 'metric': 'euclidean', 'n_neighbors': 15, 'random_state': 0}"

(538, 30)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,538.0,538.0,538.0,538.0,538.0
mean,7.650585,2.487923,4.266134,7.959554,5.350358
std,1.633021,0.137136,0.161323,0.492473,0.348095
min,4.033622,2.006037,3.944194,7.184736,4.891445
25%,5.673139,2.399465,4.13638,7.591557,5.080006
50%,8.616965,2.504416,4.245038,7.794735,5.21918
75%,8.796295,2.582121,4.38658,8.464794,5.64528
max,9.206652,2.816615,4.755887,9.068613,6.346341


PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap/umap-z_score_std-diseases_only-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_30-n_neighbors_15-random_state_0.pkl')



# components: 40


"UMAP options: {'n_components': 40, 'metric': 'euclidean', 'n_neighbors': 15, 'random_state': 0}"

(538, 40)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,538.0,538.0,538.0,538.0,538.0
mean,7.179404,2.781799,4.460528,8.910821,5.618299
std,1.786826,0.277461,0.39435,0.129893,0.540176
min,3.365756,2.27909,3.826882,8.632036,4.437049
25%,5.006432,2.576856,4.171919,8.831008,4.982498
50%,8.237466,2.702696,4.316584,8.897143,5.857323
75%,8.447595,3.037609,4.848925,8.987631,6.010362
max,8.866558,3.535369,5.513491,9.278026,6.382192


PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap/umap-z_score_std-diseases_only-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_40-n_neighbors_15-random_state_0.pkl')



# components: 50


"UMAP options: {'n_components': 50, 'metric': 'euclidean', 'n_neighbors': 15, 'random_state': 0}"

(538, 50)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,538.0,538.0,538.0,538.0,538.0
mean,7.985046,2.601374,4.829426,8.431797,6.059007
std,2.297926,0.278715,0.191931,0.181283,0.560595
min,2.842567,1.874109,4.428329,8.073075,4.749494
25%,5.216704,2.347823,4.660509,8.297781,5.46429
50%,9.395594,2.698653,4.820894,8.411202,6.322286
75%,9.576528,2.803142,4.967501,8.572313,6.481516
max,10.044398,3.033765,5.325864,8.810924,6.74529


PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap/umap-z_score_std-diseases_only-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_50-n_neighbors_15-random_state_0.pkl')



