# Description

It projects the input data for **disease-only** traits into UMAP representations, one for each number of components listed in the settings below.

# Modules loading

In [1]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules are picked up before each cell runs without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
from IPython.display import display

import pandas as pd
import umap
import matplotlib.pyplot as plt
import seaborn as sns

import conf
from utils import generate_result_set_name
from data.cache import read_data

# Settings

In [3]:
# Resolve the pickled input file inside the data-transformations results folder.
INPUT_FILEPATH = (
    Path(conf.RESULTS["DATA_TRANSFORMATIONS_DIR"])
    / 'traits_selections'
    / 'diseases_only-projection-smultixcan-efo_partial-mashr-zscores.pkl'
).resolve()
display(INPUT_FILEPATH)

# File name without its extension; it is embedded in the output file names.
input_filepath_stem = INPUT_FILEPATH.stem
display(input_filepath_stem)

PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/traits_selections/diseases_only-projection-smultixcan-efo_partial-mashr-zscores.pkl')

'diseases_only-projection-smultixcan-efo_partial-mashr-zscores'

In [4]:
# UMAP options for the dimensionality reduction step. 'n_components' holds the
# list of embedding sizes to try (one run per value); the other entries are
# single values shared by every run.
DR_OPTIONS = dict(
    n_components=[5, 10, 20, 30, 40, 50],
    metric='euclidean',
    n_neighbors=15,
    random_state=0,
)

In [5]:
# Folder where this notebook writes its UMAP projections; it is created
# (together with any missing parents) if it does not exist yet.
RESULTS_DIR = (Path(conf.RESULTS["DATA_TRANSFORMATIONS_DIR"]) / 'umap').resolve()
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

display(RESULTS_DIR)

PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap')

In [6]:
# Every option/setting that identifies a run. This dictionary is what gets
# passed to the file-name generator, so it starts as a (shallow) copy of the
# UMAP options and is the place to add any extra naming-only entries.
ALL_OPTIONS = dict(DR_OPTIONS)

display(ALL_OPTIONS)

{'n_components': [5, 10, 20, 30, 40, 50],
 'metric': 'euclidean',
 'n_neighbors': 15,
 'random_state': 0}

# Load input file

In [7]:
# Load the pickled input object from INPUT_FILEPATH.
# NOTE(review): unpickling can execute arbitrary code; only read pickle files
# that come from a trusted source (e.g. produced by this project's pipeline).
data = pd.read_pickle(INPUT_FILEPATH)

In [8]:
# dimensions of the loaded data: (rows, columns)
data.shape

(538, 987)

In [9]:
# quick look at the first rows of the loaded data
data.head()

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,...,LV978,LV979,LV980,LV981,LV982,LV983,LV984,LV985,LV986,LV987
vascular disease AND intestinal disease,0.014921,0.007221,-0.021297,-0.013469,-0.004764,0.019406,0.034742,0.015188,-0.054239,-0.057495,...,-6e-06,0.009317,-0.021804,-0.025295,0.004658,-0.007537,-0.038976,-0.000494,-0.046299,0.039424
"osteoarthritis, knee",-0.027514,-0.011319,-0.005507,0.02466,-0.012269,-0.037501,-0.041009,-0.018378,0.05672,-0.035201,...,-0.021507,0.023317,-0.02085,0.025021,0.003739,-0.008428,-0.040916,0.012425,0.03313,0.009771
carpal tunnel syndrome,0.037605,0.026694,-0.004169,-0.009875,-0.038713,-0.019403,-0.035912,0.008916,0.009359,0.034914,...,-0.009627,0.073954,-0.045759,0.030834,-0.013835,-0.016858,0.00382,-0.012988,-0.03613,-0.011489
gastritis,-0.028098,0.006805,0.011964,-0.008576,-0.034687,-0.033875,-0.04308,0.037237,-0.006757,0.003071,...,0.037084,0.039675,0.034724,0.004581,0.021483,-0.023586,0.031696,-0.003781,0.017152,0.041463
neoplasm,0.015709,0.004118,-0.015952,0.016398,0.010938,-0.01717,0.01179,0.055354,-0.020357,0.018452,...,-0.002537,-0.004774,0.006254,-0.00818,-0.014205,0.031993,0.021475,0.015875,0.031616,-0.041767


# UMAP

In [10]:
def get_umap_proj(orig_data, options):
    """Fit UMAP on `orig_data` and return the learned embedding.

    Parameters
    ----------
    orig_data : pandas.DataFrame (or any object UMAP accepts that has `.index`)
        Data to embed, one observation per row. Its index is copied onto the
        result.
    options : dict
        Run settings. Only the keys that also appear in the module-level
        `DR_OPTIONS` are forwarded to `umap.UMAP`; any other key (for example
        an entry used only to build file names) is ignored here. Values are
        passed through unchanged, so `n_components` must already be a single
        number rather than the list stored in `DR_OPTIONS`.

    Returns
    -------
    pandas.DataFrame
        One row per row of `orig_data` (same index) and one column per
        embedding dimension, named 'UMAP1', 'UMAP2', ...
    """
    # keep only the entries that are actual UMAP parameters
    umap_params = {k: v for k, v in options.items() if k in DR_OPTIONS}

    # fit_transform returns the embedding learned for the training data; a
    # separate transform() call on the same data is not needed
    umap_data = umap.UMAP(**umap_params).fit_transform(orig_data)

    return pd.DataFrame(
        data=umap_data,
        index=orig_data.index.copy(),
        columns=[f'UMAP{i+1}' for i in range(umap_data.shape[1])],
    )

In [11]:
# Only the number of components is varied; every other option keeps the value
# stored in ALL_OPTIONS. One projection is computed, checked and saved per value.
for n_comp in DR_OPTIONS['n_components']:
    print(f'# components: {n_comp}')

    # settings of this run: the shared options with a single n_components value
    options = {**ALL_OPTIONS, 'n_components': n_comp}

    dr_data = get_umap_proj(data, options)

    # sanity check: one row per input row and one column per component
    display(dr_data.shape)
    assert dr_data.shape == (data.shape[0], n_comp)

    # summary statistics of the first (up to) five components
    display(dr_data.iloc[:, :5].describe())

    # save: the file name encodes the input file stem and the run's options
    output_file = RESULTS_DIR.joinpath(
        generate_result_set_name(
            options,
            prefix=f'umap-{input_filepath_stem}-',
            suffix='.pkl',
        )
    ).resolve()
    display(output_file)

    dr_data.to_pickle(output_file)

    print('\n')

# components: 5


(538, 5)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,538.0,538.0,538.0,538.0,538.0
mean,11.69839,6.351459,6.063508,8.715043,8.815549
std,1.363299,0.579099,1.036406,0.717553,0.536417
min,8.846291,5.055324,4.505157,6.932583,7.708518
25%,10.508924,5.894459,5.286444,8.19579,8.400151
50%,12.259822,6.333099,5.699534,8.797594,8.787123
75%,12.687287,6.834639,6.943837,9.254248,9.22122
max,13.536255,7.622207,8.744159,10.114009,10.044185


PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap/umap-diseases_only-projection-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_5-n_neighbors_15-random_state_0.pkl')



# components: 10


(538, 10)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,538.0,538.0,538.0,538.0,538.0
mean,8.980839,5.902012,0.154338,4.647347,9.247462
std,0.737701,0.316361,0.350059,1.034498,0.41489
min,7.069083,5.15353,-0.747229,3.15873,8.433452
25%,8.400496,5.667927,-0.132426,3.889028,8.909946
50%,9.208768,5.930945,0.144319,4.22966,9.290685
75%,9.552984,6.137457,0.460004,5.498875,9.532115
max,10.100342,6.595897,0.902308,7.077068,10.25614


PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap/umap-diseases_only-projection-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_10-n_neighbors_15-random_state_0.pkl')



# components: 20


(538, 20)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,538.0,538.0,538.0,538.0,538.0
mean,8.674925,7.14948,2.470582,9.024168,6.822459
std,0.716936,0.155852,0.662525,0.577149,0.244387
min,7.06873,6.727529,0.8568,7.723488,6.086979
25%,8.053979,7.044779,1.878851,8.529346,6.657654
50%,8.954387,7.147669,2.75472,9.296133,6.849231
75%,9.21867,7.256415,2.965782,9.433777,7.006309
max,9.748717,7.580872,3.290337,9.760523,7.294079


PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap/umap-diseases_only-projection-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_20-n_neighbors_15-random_state_0.pkl')



# components: 30


(538, 30)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,538.0,538.0,538.0,538.0,538.0
mean,10.448567,5.82647,-0.323963,7.625603,9.246286
std,0.708035,0.412532,0.60385,0.168246,0.186903
min,8.654788,4.982652,-1.127046,7.165621,8.753314
25%,9.840918,5.539575,-0.741957,7.514384,9.117325
50%,10.783122,5.668615,-0.583924,7.646608,9.239881
75%,10.96366,6.191707,0.175815,7.747705,9.377318
max,11.300587,6.630699,1.137936,8.069474,9.684309


PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap/umap-diseases_only-projection-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_30-n_neighbors_15-random_state_0.pkl')



# components: 40


(538, 40)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,538.0,538.0,538.0,538.0,538.0
mean,8.570003,6.840653,0.012449,8.653414,7.588523
std,0.442001,0.25823,0.542797,0.354773,0.249461
min,7.511904,6.424364,-0.717533,7.707873,7.080948
25%,8.173218,6.641937,-0.379681,8.366653,7.382907
50%,8.748106,6.763127,-0.235558,8.783742,7.556041
75%,8.904224,7.071409,0.489969,8.920159,7.793006
max,9.295289,7.417861,1.281133,9.225111,8.120114


PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap/umap-diseases_only-projection-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_40-n_neighbors_15-random_state_0.pkl')



# components: 50


(538, 50)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,538.0,538.0,538.0,538.0,538.0
mean,8.13246,7.396153,-0.074768,9.338972,7.77172
std,0.510653,0.29692,0.577381,0.702729,0.138233
min,6.97885,6.738666,-0.898324,7.658731,7.463675
25%,7.717185,7.135725,-0.510513,8.77018,7.660825
50%,8.373206,7.498573,-0.293383,9.711115,7.772849
75%,8.495791,7.615594,0.415115,9.832593,7.882412
max,8.757478,7.887705,1.252994,10.139756,8.09804


PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap/umap-diseases_only-projection-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_50-n_neighbors_15-random_state_0.pkl')



