# Description

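This notebook projects the input data (the z-score-standardized file set in the Settings section) into UMAP representations, one for each number of components listed in `DR_OPTIONS`, and saves each projection as a pickle file.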

# Modules loading

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
from IPython.display import display

import pandas as pd
import umap
import matplotlib.pyplot as plt
import seaborn as sns

import conf
from utils import generate_result_set_name

# Settings

In [3]:
# stem of the input file to project; the "z_score_std-" prefix and the ".pkl"
# extension are added when the full input path is built
INPUT_FILEPATH_STEM = 'diseases_only-projection-smultixcan-efo_partial-mashr-zscores'

In [4]:
# Absolute path of the input pickle file, located in the 'z_score_std'
# subfolder of the data transformations directory.
INPUT_FILEPATH = (
    Path(conf.RESULTS["DATA_TRANSFORMATIONS_DIR"])
    / 'z_score_std'
    / f'z_score_std-{INPUT_FILEPATH_STEM}.pkl'
).resolve()
display(INPUT_FILEPATH)

# File name without extension; used later as part of the output file names.
input_filepath_stem = INPUT_FILEPATH.stem
display(input_filepath_stem)

PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/z_score_std/z_score_std-diseases_only-projection-smultixcan-efo_partial-mashr-zscores.pkl')

'z_score_std-diseases_only-projection-smultixcan-efo_partial-mashr-zscores'

In [5]:
# Settings for the dimensionality reduction step. `n_components` lists every
# number of components to generate a projection for (one run per value); the
# remaining entries are used unchanged in every run.
DR_OPTIONS = dict(
    n_components=[5, 10, 20, 30, 40, 50],
    metric='euclidean',
    n_neighbors=15,
    random_state=0,
)

In [6]:
# Directory where this notebook saves its results; created (with any missing
# parents) if it does not exist yet.
RESULTS_DIR = (Path(conf.RESULTS["DATA_TRANSFORMATIONS_DIR"]) / 'umap').resolve()
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

display(RESULTS_DIR)

PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap')

In [7]:
# All options/settings of this notebook, gathered in one dictionary that is
# used to generate the output file names. It starts as a (shallow) copy of
# the UMAP settings.
ALL_OPTIONS = dict(DR_OPTIONS)
# ALL_OPTIONS['proj_percentile'] = PERCENTILE_NAME

display(ALL_OPTIONS)

{'n_components': [5, 10, 20, 30, 40, 50],
 'metric': 'euclidean',
 'n_neighbors': 15,
 'random_state': 0}

# Load input file

In [8]:
# load the input data from the pickle file set in INPUT_FILEPATH
# NOTE(review): unpickling can execute arbitrary code; only load files
# generated by a trusted step of the pipeline
data = pd.read_pickle(INPUT_FILEPATH)

In [9]:
# dimensions of the loaded data: (number of rows, number of columns)
data.shape

(538, 987)

In [10]:
# preview the first rows of the loaded data
data.head()

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,...,LV978,LV979,LV980,LV981,LV982,LV983,LV984,LV985,LV986,LV987
vascular disease AND intestinal disease,0.374886,0.12301,-0.688734,-0.462234,-0.270475,0.76501,1.101153,0.443515,-2.131315,-1.989014,...,-0.084253,0.291335,-0.829496,-0.944208,0.070689,-0.310739,-1.05439,-0.058465,-1.7184,1.318405
"osteoarthritis, knee",-1.006925,-0.523367,-0.205909,0.845292,-0.520447,-1.567783,-1.392684,-0.697141,2.420293,-1.249693,...,-0.859185,0.803672,-0.792661,0.941227,0.038807,-0.33575,-1.109207,0.4264,1.15183,0.396533
carpal tunnel syndrome,1.113546,0.801889,-0.16502,-0.339,-1.40132,-0.825885,-1.224864,0.230387,0.477523,1.075486,...,-0.431015,2.656686,-1.755027,1.159056,-0.570832,-0.572341,0.155003,-0.527333,-1.350922,-0.264405
gastritis,-1.025944,0.108498,0.328317,-0.294424,-1.267212,-1.419151,-1.460856,1.192783,-0.183589,0.019485,...,1.252517,1.402279,1.354473,0.175315,0.654345,-0.761176,0.942745,-0.18181,0.574468,1.381791
neoplasm,0.400556,0.014828,-0.525315,0.561977,0.252577,-0.734367,0.345537,1.808464,-0.74147,0.52955,...,-0.175486,-0.224286,0.25454,-0.302883,-0.583656,0.798698,0.653916,0.55587,1.097121,-1.20567


# UMAP

In [11]:
def get_umap_proj(orig_data, options):
    """Fit UMAP on `orig_data` and return the resulting embedding.

    Parameters
    ----------
    orig_data : pandas.DataFrame
        Data to project; its index is copied into the returned data frame.
    options : dict
        Settings dictionary. Only the entries whose key is also present in
        the global `DR_OPTIONS` are passed to `umap.UMAP`; any other entry
        is ignored.

    Returns
    -------
    pandas.DataFrame
        Data frame with the same index as `orig_data` and one column per
        UMAP component, named `UMAP1`, `UMAP2`, and so on.
    """
    umap_options = {k: v for k, v in options.items() if k in DR_OPTIONS}
    display(f'UMAP options: {umap_options}')

    # fit_transform returns the embedding learned while fitting. Calling
    # fit() and then transform() on the same training data is a redundant
    # second pass and, depending on the umap-learn version, may return an
    # approximate re-projection instead of the fitted embedding.
    umap_data = umap.UMAP(**umap_options).fit_transform(orig_data)

    return pd.DataFrame(
        data=umap_data,
        index=orig_data.index.copy(),
        columns=[f'UMAP{i + 1}' for i in range(umap_data.shape[1])],
    )

In [12]:
# Compute and save one UMAP projection per requested number of components;
# all the other settings stay fixed across runs.
for n_comp in DR_OPTIONS['n_components']:
    print(f'# components: {n_comp}')

    # settings of this run: ALL_OPTIONS with a single value for n_components
    # instead of the full list
    options = {**ALL_OPTIONS, 'n_components': n_comp}

    dr_data = get_umap_proj(data, options)

    # the projection must keep all rows and have exactly n_comp columns
    display(dr_data.shape)
    assert dr_data.shape == (data.shape[0], n_comp)

    # summary statistics of (at most) the first five components
    display(dr_data.iloc[:, :5].describe())

    # save; the file name is generated from the input file stem and the
    # options of this run
    output_file = (
        RESULTS_DIR
        / generate_result_set_name(
            options,
            prefix=f'umap-{input_filepath_stem}-',
            suffix='.pkl',
        )
    ).resolve()
    display(output_file)

    dr_data.to_pickle(output_file)

    print('\n')

# components: 5


"UMAP options: {'n_components': 5, 'metric': 'euclidean', 'n_neighbors': 15, 'random_state': 0}"

(538, 5)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,538.0,538.0,538.0,538.0,538.0
mean,7.96961,4.574231,12.713342,7.108882,9.851263
std,0.406659,0.877489,1.562392,0.795271,0.473089
min,6.965586,3.061604,9.060485,5.736965,8.937288
25%,7.675872,3.913773,11.471681,6.477954,9.443687
50%,8.002914,4.366972,13.436659,6.964344,9.768209
75%,8.25746,5.254064,13.865288,7.724038,10.224056
max,9.106561,6.588826,14.614759,8.692741,11.043744


PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap/umap-z_score_std-diseases_only-projection-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_5-n_neighbors_15-random_state_0.pkl')



# components: 10


"UMAP options: {'n_components': 10, 'metric': 'euclidean', 'n_neighbors': 15, 'random_state': 0}"

(538, 10)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,538.0,538.0,538.0,538.0,538.0
mean,10.263988,6.099322,10.310119,5.472342,8.682941
std,0.998204,0.291843,0.290281,1.061089,0.349931
min,7.834297,5.231123,9.715877,4.069211,8.010818
25%,9.440652,5.909181,10.107491,4.679148,8.393363
50%,10.692434,6.077678,10.295436,5.033515,8.658119
75%,10.973258,6.288956,10.499251,6.303513,8.934837
max,11.536099,6.86732,11.057426,7.902147,9.532352


PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap/umap-z_score_std-diseases_only-projection-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_10-n_neighbors_15-random_state_0.pkl')



# components: 20


"UMAP options: {'n_components': 20, 'metric': 'euclidean', 'n_neighbors': 15, 'random_state': 0}"

(538, 20)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,538.0,538.0,538.0,538.0,538.0
mean,7.354422,6.238368,0.198971,7.641459,9.150332
std,0.885932,0.520133,0.213093,0.429021,0.286663
min,5.086062,5.393691,-0.548934,6.918926,8.421145
25%,6.53724,5.862307,0.054211,7.31842,8.918267
50%,7.700912,6.040362,0.220266,7.524281,9.181055
75%,7.992031,6.681155,0.37836,8.005546,9.355865
max,8.683851,7.341183,0.631639,8.620175,9.837655


PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap/umap-z_score_std-diseases_only-projection-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_20-n_neighbors_15-random_state_0.pkl')



# components: 30


"UMAP options: {'n_components': 30, 'metric': 'euclidean', 'n_neighbors': 15, 'random_state': 0}"

(538, 30)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,538.0,538.0,538.0,538.0,538.0
mean,9.214407,6.15493,1.578305,8.588137,8.855339
std,0.869742,0.289568,0.134017,0.348848,0.184242
min,7.071675,5.613863,1.279797,7.900719,8.492102
25%,8.514426,5.929752,1.476557,8.262686,8.71066
50%,9.659132,6.074119,1.560822,8.686519,8.830552
75%,9.826126,6.37608,1.663226,8.869854,8.977724
max,10.23988,6.848249,1.959341,9.20896,9.386401


PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap/umap-z_score_std-diseases_only-projection-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_30-n_neighbors_15-random_state_0.pkl')



# components: 40


"UMAP options: {'n_components': 40, 'metric': 'euclidean', 'n_neighbors': 15, 'random_state': 0}"

(538, 40)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,538.0,538.0,538.0,538.0,538.0
mean,8.146441,7.761971,-0.047784,8.418512,7.675556
std,0.473701,0.125247,0.114372,0.10008,0.323692
min,7.08205,7.412344,-0.358256,8.070831,7.018172
25%,7.771304,7.669454,-0.136633,8.356688,7.42897
50%,8.297474,7.761039,-0.058676,8.420005,7.596699
75%,8.501098,7.843117,0.0272,8.48434,7.884751
max,8.920479,8.120782,0.361525,8.782947,8.382937


PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap/umap-z_score_std-diseases_only-projection-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_40-n_neighbors_15-random_state_0.pkl')



# components: 50


"UMAP options: {'n_components': 50, 'metric': 'euclidean', 'n_neighbors': 15, 'random_state': 0}"

(538, 50)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,538.0,538.0,538.0,538.0,538.0
mean,7.47836,7.180451,0.199628,8.971105,6.837708
std,0.470957,0.239597,0.192417,0.156052,0.638597
min,6.44676,6.688674,-0.193593,8.587032,5.959533
25%,7.074446,7.000153,0.048597,8.866159,6.389824
50%,7.681846,7.163648,0.190888,8.962038,6.542828
75%,7.823059,7.375129,0.337098,9.070706,7.384642
max,8.075608,7.730057,0.632481,9.387962,8.329343


PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap/umap-z_score_std-diseases_only-projection-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_50-n_neighbors_15-random_state_0.pkl')



