# Description

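This notebook projects the input data into UMAP representations, one for each configured number of components, and saves each projection to a pickle file.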

# Modules loading

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
from IPython.display import display

import pandas as pd
import umap
import matplotlib.pyplot as plt
import seaborn as sns

import conf
from utils import generate_result_set_name

# Settings

In [3]:
# absolute path of the input file, taken from the project configuration
INPUT_FILEPATH = Path(conf.PHENOMEXCAN["SMULTIXCAN_EFO_PARTIAL_MASHR_ZSCORES_FILE"]).resolve()
# file name without its extension; used later as part of the output file names
input_filepath_stem = INPUT_FILEPATH.stem

display(INPUT_FILEPATH)
display(input_filepath_stem)

PosixPath('/media/miltondp/Elements1/projects/phenoplier/data/phenomexcan/gene_assoc/smultixcan-efo_partial-mashr-zscores.pkl')

'smultixcan-efo_partial-mashr-zscores'

In [4]:
# options for the dimensionality reduction step: 'n_components' lists every
# embedding size to compute (one UMAP run per value); the remaining entries are
# used unchanged in each run
DR_OPTIONS = dict(
    n_components=[5, 10, 20, 30, 40, 50],
    metric='euclidean',
    n_neighbors=15,
    random_state=0,
)

In [5]:
# directory where this notebook writes its results (created if it is missing)
RESULTS_DIR = (Path(conf.RESULTS["DATA_TRANSFORMATIONS_DIR"]) / 'umap').resolve()
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

display(RESULTS_DIR)

PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap')

In [6]:
# every option/setting of this notebook; used to generate the output file names.
# This is a shallow copy, so the 'n_components' list is shared with DR_OPTIONS.
ALL_OPTIONS = dict(DR_OPTIONS)

display(ALL_OPTIONS)

{'n_components': [5, 10, 20, 30, 40, 50],
 'metric': 'euclidean',
 'n_neighbors': 15,
 'random_state': 0}

# Load input file

In [7]:
# load the pickled input and transpose it; after `.T` the columns are gene IDs
# (ENSG...) and each row is one entry such as `100001_raw-Food_weight`
# (see the `data.head()` output below)
data = pd.read_pickle(INPUT_FILEPATH).T

In [8]:
data.shape

(3749, 22515)

In [9]:
data.head()

gene_name,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036,ENSG00000001084,ENSG00000001167,ENSG00000001460,ENSG00000001461,...,ENSG00000284240,ENSG00000284308,ENSG00000284395,ENSG00000284413,ENSG00000284418,ENSG00000284430,ENSG00000284452,ENSG00000284513,ENSG00000284526,ENSG00000284552
100001_raw-Food_weight,1.145442,0.618066,0.515724,0.280781,0.548127,0.045587,0.329995,0.109494,1.356741,1.474255,...,1.49041,0.230329,0.596503,0.519733,2.285074,0.12498,1.587903,1.522281,0.150938,1.010143
100002_raw-Energy,0.724557,1.028131,0.403596,0.25391,0.389877,0.197393,0.669649,0.04101,0.83212,0.954183,...,1.165679,0.111142,0.084263,1.229913,0.39019,0.505316,0.975901,1.817369,0.756393,0.729526
100003_raw-Protein,0.090876,2.21842,1.251359,0.879148,0.723469,0.777974,0.207873,0.536609,0.453969,1.286942,...,0.54137,0.734872,0.634674,1.31675,0.761859,1.276888,0.160988,0.346794,0.609476,0.222126
100004_raw-Fat,0.298165,0.762584,0.433091,0.352705,1.16725,0.578435,0.738983,0.565245,0.397189,0.192279,...,0.867217,0.540941,0.284347,1.661131,0.404078,1.248959,0.799771,1.443097,0.814969,0.545356
100005_raw-Carbohydrate,1.134347,0.934418,0.413466,0.051846,0.315952,0.046237,1.113674,0.319842,0.965217,0.919779,...,1.747265,0.496178,0.144053,0.701817,0.827677,0.587188,1.089338,2.001502,1.362716,1.49003


# UMAP

In [10]:
def get_umap_proj(orig_data, options):
    """Fit UMAP on a data frame and return its projection as a data frame.

    Parameters
    ----------
    orig_data : pandas.DataFrame
        Data to project; its index is reused for the returned data frame.
    options : dict
        Settings for the run. Only the entries whose key also appears in the
        notebook-level `DR_OPTIONS` are forwarded to `umap.UMAP`; any other
        entry is ignored.

    Returns
    -------
    pandas.DataFrame
        One row per row of `orig_data` and one column per embedding
        dimension, named `UMAP1`, `UMAP2`, ...
    """
    # keep only the options that are meant for the UMAP constructor
    umap_params = {
        name: value for name, value in options.items() if name in DR_OPTIONS
    }

    # fit on the data, then project the very same data
    reducer = umap.UMAP(**umap_params).fit(orig_data)
    embedding = reducer.transform(orig_data)

    component_names = [f'UMAP{idx}' for idx in range(1, embedding.shape[1] + 1)]
    return pd.DataFrame(
        data=embedding,
        index=orig_data.index.copy(),
        columns=component_names,
    )

In [11]:
# for n_comp, n_neigh in product(DR_OPTIONS['n_components'], DR_OPTIONS['n_neighbors']):
for n_comp in DR_OPTIONS['n_components']:
    print(f'# components: {n_comp}')

    # settings for this run: the shared options with a single n_components value
    options = {**ALL_OPTIONS, 'n_components': n_comp}

    dr_data = get_umap_proj(data, options)

    # sanity check: one row per input row, one column per requested component
    display(dr_data.shape)
    assert dr_data.shape == (data.shape[0], n_comp)

    # summary statistics of (at most) the first five components
    display(dr_data.iloc[:, 0:5].describe())

    # save; the file name is built from the input file stem and this run's options
    output_filename = generate_result_set_name(
        options, prefix=f'umap-{input_filepath_stem}-', suffix='.pkl'
    )
    output_file = Path(RESULTS_DIR, output_filename).resolve()
    display(output_file)

    dr_data.to_pickle(output_file)

    print('\n')

# components: 5


(3749, 5)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,3749.0,3749.0,3749.0,3749.0,3749.0
mean,3.252615,0.633389,-4.734339,-6.253658,-0.160677
std,0.977179,0.82207,1.435681,1.466053,0.687564
min,0.249835,-1.61527,-7.048862,-9.041843,-2.109535
25%,2.856221,0.259234,-5.707329,-6.932757,-0.543586
50%,3.185971,0.610978,-5.213283,-6.504684,-0.19617
75%,3.519284,0.985862,-4.235223,-5.860087,0.16188
max,11.483977,14.233703,4.708987,4.364089,3.257919


PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap/umap-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_5-n_neighbors_15-random_state_0.pkl')



# components: 10


(3749, 10)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,3749.0,3749.0,3749.0,3749.0,3749.0
mean,-3.385514,11.781983,11.702998,-1.191766,-0.269153
std,1.601074,0.776171,0.720756,0.643258,0.460902
min,-4.901329,-0.788012,4.306539,-3.431091,-1.618921
25%,-4.229977,11.54525,11.494916,-1.506473,-0.553043
50%,-3.89036,11.879045,11.837508,-1.227348,-0.307382
75%,-3.05437,12.151494,12.120855,-0.967469,-0.036653
max,8.238693,13.077419,12.994085,2.677575,2.13396


PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap/umap-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_10-n_neighbors_15-random_state_0.pkl')



# components: 20


(3749, 20)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,3749.0,3749.0,3749.0,3749.0,3749.0
mean,-0.992039,0.65144,-2.236039,-0.004241,0.547745
std,1.111902,0.537406,0.588273,0.476805,0.363006
min,-2.297202,-0.105243,-3.542164,-1.805669,-0.485184
25%,-1.412976,0.489357,-2.555759,-0.166228,0.360743
50%,-1.197351,0.652379,-2.334689,0.061306,0.536027
75%,-0.8942,0.764171,-2.032815,0.229995,0.703449
max,8.83723,11.313079,4.504565,2.601645,2.839957


PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap/umap-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_20-n_neighbors_15-random_state_0.pkl')



# components: 30


(3749, 30)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,3749.0,3749.0,3749.0,3749.0,3749.0
mean,12.507243,10.855247,10.003415,11.732469,10.535384
std,1.241598,0.648087,0.406464,0.60636,0.398314
min,2.050032,-1.269892,3.819613,7.413626,8.536633
25%,12.494047,10.688519,9.893262,11.502682,10.414731
50%,12.839887,10.940052,10.022593,11.675805,10.617169
75%,12.990063,11.108652,10.157248,11.89528,10.754359
max,13.477344,11.623949,10.81735,13.435103,11.403426


PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap/umap-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_30-n_neighbors_15-random_state_0.pkl')



# components: 40


(3749, 40)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,3749.0,3749.0,3749.0,3749.0,3749.0
mean,11.396314,11.343591,11.401268,11.153778,12.100967
std,1.029006,0.734122,0.468746,0.555436,0.580361
min,2.070374,-2.6108,5.130984,8.074213,8.974076
25%,11.348479,11.23163,11.310749,10.894485,12.025827
50%,11.599537,11.460111,11.466979,11.070588,12.27028
75%,11.744359,11.607738,11.595957,11.315697,12.40395
max,12.110102,12.068358,12.009825,13.047127,12.91175


PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap/umap-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_40-n_neighbors_15-random_state_0.pkl')



# components: 50


(3749, 50)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,3749.0,3749.0,3749.0,3749.0,3749.0
mean,11.674576,10.371646,11.158474,11.864619,9.776398
std,1.005584,0.591366,0.503599,0.649769,0.332357
min,2.541232,-1.669116,4.016442,7.441638,7.50904
25%,11.611922,10.280213,11.057518,11.672916,9.677867
50%,11.862285,10.437992,11.26262,11.831642,9.824313
75%,12.010863,10.540978,11.36944,12.033896,9.927349
max,12.800804,10.840425,11.896875,13.577888,10.520713


PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap/umap-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_50-n_neighbors_15-random_state_0.pkl')



