# Description

This notebook projects the input data (**disease-only** traits) into UMAP representations, one for each number of components listed in the settings below.

# Modules loading

In [1]:
# Enable the IPython autoreload extension in mode 2, so that edits to
# imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
from IPython.display import display

import pandas as pd
import umap
import matplotlib.pyplot as plt
import seaborn as sns

import conf
from utils import generate_result_set_name
from data.cache import read_data

# Settings

In [3]:
# Pickled input data to project; its file stem is reused later as part of
# the output filenames.
INPUT_FILEPATH = (
    Path(conf.RESULTS["DATA_TRANSFORMATIONS_DIR"])
    / 'traits_selections'
    / 'diseases_only-smultixcan-efo_partial-mashr-zscores.pkl'
).resolve()
display(INPUT_FILEPATH)

input_filepath_stem = INPUT_FILEPATH.stem
display(input_filepath_stem)

PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/traits_selections/diseases_only-smultixcan-efo_partial-mashr-zscores.pkl')

'diseases_only-smultixcan-efo_partial-mashr-zscores'

In [4]:
# UMAP settings. 'n_components' holds a list of values: a separate
# projection is computed for each of them, while the remaining entries are
# used unchanged in every run.
DR_OPTIONS = dict(
    n_components=[5, 10, 20, 30, 40, 50],
    metric='euclidean',
    n_neighbors=15,
    random_state=0,
)

In [5]:
# directory where this notebook writes its results (created if missing)
RESULTS_DIR = (Path(conf.RESULTS["DATA_TRANSFORMATIONS_DIR"]) / 'umap').resolve()
RESULTS_DIR.mkdir(exist_ok=True, parents=True)

display(RESULTS_DIR)

PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap')

In [6]:
# every option/setting of this run collected in a single dictionary, which
# is used to generate the output filenames (shallow copy of DR_OPTIONS)
ALL_OPTIONS = dict(DR_OPTIONS)

display(ALL_OPTIONS)

{'n_components': [5, 10, 20, 30, 40, 50],
 'metric': 'euclidean',
 'n_neighbors': 15,
 'random_state': 0}

# Load input file

In [7]:
# Load the pickled input data selected in the settings section.
# NOTE(review): unpickling can execute arbitrary code, so only files
# produced by this project's own pipeline should be loaded here.
data = pd.read_pickle(INPUT_FILEPATH)

In [8]:
# sanity check: (number of rows, number of columns) of the input data
data.shape

(538, 22515)

In [9]:
# preview the first rows of the input data
data.head()

gene_name,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036,ENSG00000001084,ENSG00000001167,ENSG00000001460,ENSG00000001461,...,ENSG00000284240,ENSG00000284308,ENSG00000284395,ENSG00000284413,ENSG00000284418,ENSG00000284430,ENSG00000284452,ENSG00000284513,ENSG00000284526,ENSG00000284552
vascular disease AND intestinal disease,1.463849,0.295149,0.862394,0.64243,2.560404,0.276682,1.25269,0.138074,0.253713,0.13149,...,0.238496,1.582408,0.470057,1.212982,1.320907,1.348926,0.925907,0.648023,3.172445,0.658885
"osteoarthritis, knee",1.139573,0.339193,0.063113,0.673883,1.480963,0.029437,0.738344,0.597559,0.310168,0.262036,...,0.144221,0.558024,1.497799,3.391703,0.745621,0.681965,0.577402,0.41246,0.24563,1.40189
carpal tunnel syndrome,0.91841,0.592007,0.221104,1.875068,0.427498,1.564119,1.739466,0.369079,0.795678,0.811066,...,0.326533,1.521576,0.307704,0.155117,1.459384,0.813065,0.526617,0.618006,0.384546,1.114975
gastritis,1.616585,0.675119,0.847838,0.976489,0.275587,0.13709,0.246508,1.27692,0.36882,0.205764,...,2.220743,0.934335,0.858603,0.42243,0.221982,0.083101,0.909691,0.539585,1.158575,0.101605
neoplasm,1.445983,2.245419,1.185771,1.236719,0.298348,0.987865,0.670254,0.563399,1.70164,1.710727,...,1.933531,0.586532,0.324582,0.557693,0.986534,1.205885,1.061329,0.743238,0.591596,0.730569


# UMAP

In [10]:
def get_umap_proj(orig_data, options):
    """Fit UMAP on `orig_data` and return its projection as a data frame.

    Parameters
    ----------
    orig_data
        Input data to embed. Its `.index` is copied and reused as the index
        of the result.
    options : dict
        Settings for the run. Only the entries whose key is also present in
        the module-level `DR_OPTIONS` are forwarded to `umap.UMAP`; any
        other entry is ignored.

    Returns
    -------
    pandas.DataFrame
        One row per row of `orig_data` and one column per embedding
        dimension, named 'UMAP1', 'UMAP2', ...
    """
    # keep only the entries that are UMAP settings
    umap_kwargs = {
        name: value for name, value in options.items() if name in DR_OPTIONS
    }

    # NOTE(review): the model is fitted and then the same data is passed
    # through transform(); fit_transform() might be a one-step equivalent,
    # but confirm it yields the same numbers with the installed umap-learn
    # version before changing this.
    reducer = umap.UMAP(**umap_kwargs).fit(orig_data)
    embedding = reducer.transform(orig_data)

    component_names = [f'UMAP{idx}' for idx in range(1, embedding.shape[1] + 1)]
    return pd.DataFrame(
        data=embedding,
        index=orig_data.index.copy(),
        columns=component_names,
    )

In [11]:
# compute and save one UMAP projection per requested number of components
for n_comp in DR_OPTIONS['n_components']:
    print(f'# components: {n_comp}')

    # same settings for every run, except for the number of components
    options = {**ALL_OPTIONS, 'n_components': n_comp}

    dr_data = get_umap_proj(data, options)

    # one row per input row, one column per component
    display(dr_data.shape)
    assert dr_data.shape == (len(data), n_comp)

    # summary statistics of the first five components only
    display(dr_data.iloc[:, :5].describe())

    # save
    output_file = (
        RESULTS_DIR
        / generate_result_set_name(
            options,
            prefix=f'umap-{input_filepath_stem}-',
            suffix='.pkl',
        )
    ).resolve()
    display(output_file)

    dr_data.to_pickle(output_file)

    print('\n')

# components: 5


(538, 5)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,538.0,538.0,538.0,538.0,538.0
mean,2.523457,3.089354,3.417991,4.47446,3.45039
std,1.162074,0.721358,0.487387,0.915946,1.660142
min,0.855264,1.724245,2.315415,2.884456,1.264891
25%,1.587472,2.528807,3.048225,3.754555,2.16934
50%,2.136133,2.922859,3.453656,4.235963,2.662531
75%,3.459928,3.708803,3.838831,5.314695,5.18699
max,5.563294,4.70571,4.352755,6.59104,6.600336


PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap/umap-diseases_only-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_5-n_neighbors_15-random_state_0.pkl')



# components: 10


(538, 10)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,538.0,538.0,538.0,538.0,538.0
mean,4.92622,5.74477,9.394354,7.357299,4.857917
std,1.747943,0.525511,0.351106,0.832396,0.328
min,1.212473,4.765693,8.447693,6.044429,4.225317
25%,3.24409,5.447582,9.137399,6.716235,4.636795
50%,5.905577,5.671469,9.409024,7.02895,4.785826
75%,6.261049,5.89141,9.619931,8.124026,5.025529
max,6.991565,7.616616,10.187648,8.976813,5.864987


PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap/umap-diseases_only-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_10-n_neighbors_15-random_state_0.pkl')



# components: 20


(538, 20)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,538.0,538.0,538.0,538.0,538.0
mean,6.195287,6.430822,8.128351,6.301155,5.474097
std,1.068121,0.150033,0.403351,0.436417,0.242871
min,3.242134,5.968815,7.353062,5.370771,4.884989
25%,5.270771,6.326126,7.854867,5.932114,5.254771
50%,6.722421,6.411115,8.020808,6.434557,5.511577
75%,6.99314,6.537681,8.38078,6.626839,5.661481
max,7.368199,6.930509,9.496861,7.135594,5.950854


PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap/umap-diseases_only-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_20-n_neighbors_15-random_state_0.pkl')



# components: 30


(538, 30)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,538.0,538.0,538.0,538.0,538.0
mean,6.809592,6.342454,7.944166,5.785023,4.653926
std,0.374727,0.606298,0.295185,0.676647,0.157014
min,5.525477,5.043921,7.324778,4.245417,4.285637
25%,6.625212,5.814288,7.720852,5.150887,4.540144
50%,6.892077,6.657752,7.906626,6.17328,4.637617
75%,7.067567,6.780429,8.170188,6.289493,4.762765
max,7.462528,7.17296,8.731231,6.589104,5.090665


PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap/umap-diseases_only-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_30-n_neighbors_15-random_state_0.pkl')



# components: 40


(538, 40)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,538.0,538.0,538.0,538.0,538.0
mean,6.573172,6.238102,7.212529,6.606807,5.695101
std,0.413837,0.156537,0.157814,0.525132,0.419085
min,5.567856,5.952479,6.747273,5.539504,5.055376
25%,6.205609,6.099872,7.1056,6.151641,5.40254
50%,6.742789,6.224386,7.196656,6.832913,5.524118
75%,6.879238,6.369987,7.314595,7.027964,6.003917
max,7.180971,6.649337,7.663711,7.253455,6.795944


PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap/umap-diseases_only-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_40-n_neighbors_15-random_state_0.pkl')



# components: 50


(538, 50)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,538.0,538.0,538.0,538.0,538.0
mean,5.948809,6.838124,8.149352,7.036644,4.756956
std,0.332564,0.382238,0.446649,0.119262,0.187734
min,4.705698,6.204206,7.531398,6.760604,4.420199
25%,5.844308,6.54277,7.804341,6.955519,4.598333
50%,6.01135,6.688265,7.922051,7.032018,4.729662
75%,6.127545,7.208886,8.629054,7.110879,4.938062
max,6.634403,7.743271,9.130408,7.384448,5.097365


PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap/umap-diseases_only-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_50-n_neighbors_15-random_state_0.pkl')



