# Description

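This notebook projects the input data (the `z_score_std` version of the traits-by-LVs matrix) into UMAP representations, one for each number of components listed in the settings, and saves each projection as a pickle file.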

# Modules loading

In [1]:
# reload imported modules automatically before executing each cell, so edits
# to project modules (e.g. conf, utils) are picked up without a kernel restart
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
from IPython.display import display

import pandas as pd
import umap
import matplotlib.pyplot as plt
import seaborn as sns

import conf
from utils import generate_result_set_name

# Settings

In [3]:
# base name of the input file (no directory, no 'z_score_std-' prefix, no extension)
INPUT_FILEPATH_STEM = 'diseases_only-projection-smultixcan-efo_partial-mashr-zscores'

In [4]:
# absolute path to the pickled input data, stored in the 'z_score_std'
# subfolder of the data transformations results directory
INPUT_FILEPATH = Path(
    conf.RESULTS["DATA_TRANSFORMATIONS_DIR"],
    'z_score_std',
    f'z_score_std-{INPUT_FILEPATH_STEM}.pkl',
).resolve()
display(INPUT_FILEPATH)

# input file name without its extension; used below as part of the prefix of
# every output file name
input_filepath_stem = INPUT_FILEPATH.stem
display(input_filepath_stem)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/data_transformations/z_score_std/z_score_std-diseases_only-projection-smultixcan-efo_partial-mashr-zscores.pkl')

'z_score_std-diseases_only-projection-smultixcan-efo_partial-mashr-zscores'

In [5]:
# options for the dimensionality reduction step (UMAP).
# 'n_components' is a list: one projection is computed and saved for each
# value in it. The remaining entries are passed unchanged to umap.UMAP.
DR_OPTIONS = {
    'n_components': [5, 10, 20, 30, 40, 50],
    'metric': 'euclidean',
    'n_neighbors': 15,
    'random_state': 0,
}

In [6]:
# output dir for this notebook
RESULTS_DIR = Path(
    conf.RESULTS["DATA_TRANSFORMATIONS_DIR"],
    'umap'
).resolve()
# create the directory (and any missing parents); no error if it already exists
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

display(RESULTS_DIR)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/data_transformations/umap')

In [7]:
# dictionary containing all options/settings (used to generate filenames)
# it is a shallow copy of DR_OPTIONS, so at this point both hold the same entries
ALL_OPTIONS = DR_OPTIONS.copy()
# NOTE(review): the disabled line below refers to PERCENTILE_NAME, which is not
# defined in the cells above; it would fail if uncommented as is
# ALL_OPTIONS['proj_percentile'] = PERCENTILE_NAME

display(ALL_OPTIONS)

{'n_components': [5, 10, 20, 30, 40, 50],
 'metric': 'euclidean',
 'n_neighbors': 15,
 'random_state': 0}

# Load input file

In [8]:
# load the input data from the pickle file
# NOTE: unpickling can execute arbitrary code; only load files that were
# generated by a trusted source
data = pd.read_pickle(INPUT_FILEPATH)

In [9]:
# (number of rows, number of columns) of the input data
data.shape

(538, 987)

In [10]:
# preview of the first rows of the input data
data.head()

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,...,LV978,LV979,LV980,LV981,LV982,LV983,LV984,LV985,LV986,LV987
vascular disease AND intestinal disease,0.374886,0.12301,-0.688734,-0.462234,-0.270475,0.76501,1.101153,0.443515,-2.131315,-1.989014,...,-0.084253,0.291335,-0.829496,-0.944208,0.070689,-0.310739,-1.05439,-0.058465,-1.7184,1.318405
"osteoarthritis, knee",-1.006925,-0.523367,-0.205909,0.845292,-0.520447,-1.567783,-1.392684,-0.697141,2.420293,-1.249693,...,-0.859185,0.803672,-0.792661,0.941227,0.038807,-0.33575,-1.109207,0.4264,1.15183,0.396533
carpal tunnel syndrome,1.113546,0.801889,-0.16502,-0.339,-1.40132,-0.825885,-1.224864,0.230387,0.477523,1.075486,...,-0.431015,2.656686,-1.755027,1.159056,-0.570832,-0.572341,0.155003,-0.527333,-1.350922,-0.264405
gastritis,-1.025944,0.108498,0.328317,-0.294424,-1.267212,-1.419151,-1.460856,1.192783,-0.183589,0.019485,...,1.252517,1.402279,1.354473,0.175315,0.654345,-0.761176,0.942745,-0.18181,0.574468,1.381791
neoplasm,0.400556,0.014828,-0.525315,0.561977,0.252577,-0.734367,0.345537,1.808464,-0.74147,0.52955,...,-0.175486,-0.224286,0.25454,-0.302883,-0.583656,0.798698,0.653916,0.55587,1.097121,-1.20567


# UMAP

In [11]:
def get_umap_proj(orig_data, options):
    """
    Fit UMAP on the given data and return the projection of those same rows.

    Args:
        orig_data: data to project. It is passed as is to UMAP's fit and
            transform, and its `.index` is reused for the result (presumably
            a pandas DataFrame with one object per row; confirm with callers).
        options: dictionary with settings. Only the entries whose key is also
            a key of the notebook-level DR_OPTIONS are given to `umap.UMAP`;
            any other entry is ignored.

    Returns:
        A pandas DataFrame with a copy of the index of `orig_data` and one
        column per UMAP dimension, named UMAP1, UMAP2, and so on.
    """
    selected = {}
    for name, value in options.items():
        if name in DR_OPTIONS:
            selected[name] = value
    display(f'UMAP options: {selected}')

    # fit and transform are intentionally kept as two separate calls on the
    # same data, as in the original implementation
    model = umap.UMAP(**selected).fit(orig_data)
    embedding = model.transform(orig_data)

    n_dims = embedding.shape[1]
    column_names = [f'UMAP{dim}' for dim in range(1, n_dims + 1)]

    return pd.DataFrame(
        data=embedding,
        index=orig_data.index.copy(),
        columns=column_names,
    )

In [12]:
# compute and save one UMAP projection for each number of components
for n_comp in DR_OPTIONS['n_components']:
    print(f'# components: {n_comp}')

    # settings for this run: all options, with the list of sizes replaced by
    # the current number of components
    options = {**ALL_OPTIONS, 'n_components': n_comp}

    dr_data = get_umap_proj(data, options)

    # the projection must keep every row and have exactly n_comp columns
    display(dr_data.shape)
    assert dr_data.shape == (data.shape[0], n_comp)

    # summary statistics of the first five dimensions only
    display(dr_data.iloc[:, :5].describe())

    # save
    output_file = (
        RESULTS_DIR
        / generate_result_set_name(
            options,
            prefix=f'umap-{input_filepath_stem}-',
            suffix='.pkl',
        )
    ).resolve()
    display(output_file)

    dr_data.to_pickle(output_file)

    print('\n')

# components: 5


"UMAP options: {'n_components': 5, 'metric': 'euclidean', 'n_neighbors': 15, 'random_state': 0}"

(538, 5)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,538.0,538.0,538.0,538.0,538.0
mean,11.437816,7.258334,8.553856,7.259817,10.623873
std,1.261588,0.421215,0.805494,1.063132,0.808115
min,8.840378,6.077176,7.158512,5.649707,8.937119
25%,10.357782,6.964365,7.947083,6.442074,10.053707
50%,11.921645,7.314492,8.346437,6.941426,10.778704
75%,12.38765,7.55911,9.146773,8.120754,11.241063
max,13.079304,8.221519,10.625927,9.72336,12.013026


PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/data_transformations/umap/umap-z_score_std-diseases_only-projection-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_5-n_neighbors_15-random_state_0.pkl')



# components: 10


"UMAP options: {'n_components': 10, 'metric': 'euclidean', 'n_neighbors': 15, 'random_state': 0}"

(538, 10)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,538.0,538.0,538.0,538.0,538.0
mean,9.631227,6.541865,11.12315,8.326889,11.386127
std,0.829753,0.850887,0.471234,0.471821,1.106293
min,7.541883,5.166805,10.14413,7.377304,8.961306
25%,9.00003,5.909348,10.766114,7.915151,10.426965
50%,9.93934,6.223926,11.126602,8.432833,11.875643
75%,10.247998,7.282622,11.511865,8.698094,12.165639
max,10.820827,8.287004,11.979257,9.322357,12.713916


PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/data_transformations/umap/umap-z_score_std-diseases_only-projection-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_10-n_neighbors_15-random_state_0.pkl')



# components: 20


"UMAP options: {'n_components': 20, 'metric': 'euclidean', 'n_neighbors': 15, 'random_state': 0}"

(538, 20)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,538.0,538.0,538.0,538.0,538.0
mean,7.978203,8.330149,0.669344,7.080921,8.563838
std,0.417938,0.502665,0.206541,0.903521,0.602302
min,7.138039,7.256611,0.141046,5.97237,7.230792
25%,7.68639,7.956015,0.529338,6.42532,8.061359
50%,8.074047,8.466843,0.676287,6.687538,8.798797
75%,8.286745,8.722929,0.821971,7.898454,9.002373
max,8.852757,9.152013,1.083654,9.176187,9.452862


PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/data_transformations/umap/umap-z_score_std-diseases_only-projection-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_20-n_neighbors_15-random_state_0.pkl')



# components: 30


"UMAP options: {'n_components': 30, 'metric': 'euclidean', 'n_neighbors': 15, 'random_state': 0}"

(538, 30)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,538.0,538.0,538.0,538.0,538.0
mean,7.604849,6.608826,1.008004,8.371486,8.122981
std,0.58206,0.400803,0.305905,0.281669,0.406049
min,6.189867,5.933766,0.282447,7.787603,7.245853
25%,7.154149,6.285422,0.784252,8.16962,7.813241
50%,7.822647,6.482105,0.912976,8.305836,8.270414
75%,8.02269,6.922755,1.260287,8.591158,8.409465
max,8.43646,7.45159,1.666053,9.021709,8.870045


PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/data_transformations/umap/umap-z_score_std-diseases_only-projection-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_30-n_neighbors_15-random_state_0.pkl')



# components: 40


"UMAP options: {'n_components': 40, 'metric': 'euclidean', 'n_neighbors': 15, 'random_state': 0}"

(538, 40)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,538.0,538.0,538.0,538.0,538.0
mean,8.585965,7.020704,0.568053,8.039467,7.285839
std,0.61083,0.406266,0.110515,0.410791,0.349692
min,7.104117,6.440289,0.28363,7.352534,6.69222
25%,8.1328,6.717357,0.47717,7.738438,7.039521
50%,8.859916,6.85417,0.575019,7.896289,7.155464
75%,9.013637,7.361944,0.644533,8.348965,7.540168
max,9.408032,7.891867,0.899489,9.034426,8.210412


PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/data_transformations/umap/umap-z_score_std-diseases_only-projection-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_40-n_neighbors_15-random_state_0.pkl')



# components: 50


"UMAP options: {'n_components': 50, 'metric': 'euclidean', 'n_neighbors': 15, 'random_state': 0}"

(538, 50)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,538.0,538.0,538.0,538.0,538.0
mean,9.074385,7.175883,0.768414,7.287528,6.949073
std,0.69089,0.205153,0.21676,0.587101,0.373552
min,7.521629,6.669611,0.348695,6.56672,6.225085
25%,8.473097,7.010681,0.617193,6.866107,6.662877
50%,9.409221,7.218916,0.706145,7.002647,6.862087
75%,9.573189,7.337743,0.926346,7.778113,7.274144
max,9.81866,7.557064,1.364857,8.614837,7.763055


PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/data_transformations/umap/umap-z_score_std-diseases_only-projection-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_50-n_neighbors_15-random_state_0.pkl')



