# Description

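This notebook projects the input data (the S-MultiXcan MASHR z-scores file) into UMAP representations, one for each number of components listed in the settings.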

# Modules loading

In [None]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
from IPython.display import display

import pandas as pd
import umap
import matplotlib.pyplot as plt
import seaborn as sns

import conf
from utils import generate_result_set_name

# Settings

In [3]:
# location of the input file, as configured in `conf`
INPUT_FILEPATH = Path(conf.PHENOMEXCAN["SMULTIXCAN_MASHR_ZSCORES_FILE"]).resolve()
display(INPUT_FILEPATH)

# file name without its extension; reused later as part of the output file names
input_filepath_stem = INPUT_FILEPATH.stem
display(input_filepath_stem)

PosixPath('/media/miltondp/Elements1/projects/phenoplier/data/phenomexcan/gene_assoc/smultixcan-mashr-zscores.pkl')

'smultixcan-mashr-zscores'

In [4]:
# options for the dimensionality reduction step: `n_components` lists every
# number of components to run, the remaining entries are fixed for all runs
DR_OPTIONS = dict(
    n_components=[5, 10, 20, 30, 40, 50],
    metric='euclidean',
    n_neighbors=15,
    random_state=0,
)

In [5]:
# directory where this notebook writes its results (created if missing)
RESULTS_DIR = (Path(conf.RESULTS["DATA_TRANSFORMATIONS_DIR"]) / 'umap').resolve()
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

display(RESULTS_DIR)

PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap')

In [6]:
# every option/setting used by this notebook (used to generate filenames).
# This is a shallow copy: the 'n_components' list is shared with DR_OPTIONS.
ALL_OPTIONS = dict(DR_OPTIONS)

display(ALL_OPTIONS)

{'n_components': [5, 10, 20, 30, 40, 50],
 'metric': 'euclidean',
 'n_neighbors': 15,
 'random_state': 0}

# Load input file

In [7]:
# load the pickled input, discard rows containing any missing value, then
# transpose the result
data = (
    pd.read_pickle(INPUT_FILEPATH)
    .dropna(how='any')
    .T
)

In [8]:
# dimensions of the loaded matrix as (rows, columns)
data.shape

(4091, 22152)

In [9]:
# preview the first rows to check the index and column labels
data.head()

gene_name,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036,ENSG00000001084,ENSG00000001167,ENSG00000001460,ENSG00000001461,...,ENSG00000284240,ENSG00000284308,ENSG00000284395,ENSG00000284413,ENSG00000284418,ENSG00000284430,ENSG00000284452,ENSG00000284513,ENSG00000284526,ENSG00000284552
20096_1-Size_of_red_wine_glass_drunk_small_125ml,0.169468,1.358856,0.151008,1.302722,1.338813,0.345058,1.830607,1.52878,1.115171,0.864424,...,0.79377,1.848458,0.446812,0.699852,0.359617,1.166838,1.320478,1.116961,0.402507,0.155576
2345-Ever_had_bowel_cancer_screening,0.102558,1.846875,1.173202,0.841524,0.262339,0.197471,0.958805,0.952896,0.181759,0.124482,...,0.085738,1.257582,0.206341,2.034085,0.510161,1.940972,0.62233,0.297,1.662981,1.060303
N49-Diagnoses_main_ICD10_N49_Inflammatory_disorders_of_male_genital_organs_not_elsewhere_classified,0.239545,0.139324,1.179426,1.578926,0.689379,2.261729,1.020002,0.728584,2.12916,2.026659,...,0.249824,0.387018,0.795947,0.728276,0.237085,0.165581,1.817328,0.320781,0.632502,0.737441
100011_raw-Iron,0.887758,0.12953,0.571656,0.72134,1.702019,0.449356,0.30142,1.370224,0.966131,0.944866,...,0.463055,0.111857,0.542678,1.720391,0.014779,0.023481,0.736376,1.039808,0.710629,1.165912
5221-Index_of_best_refractometry_result_right,1.313448,0.757757,0.098771,0.139314,0.325859,0.101714,1.210617,0.436179,0.511192,0.236261,...,0.205661,3.780623,1.598964,1.20557,1.990288,0.420617,0.786033,0.781609,0.169048,0.139583


# UMAP

In [10]:
def get_umap_proj(orig_data, options):
    """Fit UMAP on `orig_data` and return its projection as a DataFrame.

    Only the entries of `options` whose keys also appear in the
    notebook-level `DR_OPTIONS` dictionary are forwarded to `umap.UMAP`.
    The model is fitted on `orig_data`, and the same data is then passed
    through `transform` to obtain the projection.

    Parameters
    ----------
    orig_data
        Data to project; its `.index` is copied onto the result, so a
        pandas DataFrame is expected.
    options : dict
        Settings for the run; keys not present in `DR_OPTIONS` are ignored.

    Returns
    -------
    pandas.DataFrame
        Indexed like `orig_data`, with one column per projected dimension
        named 'UMAP1', 'UMAP2', and so on.
    """
    # keep only the settings that belong to the dimensionality reduction step
    umap_params = {
        name: value for name, value in options.items() if name in DR_OPTIONS
    }

    model = umap.UMAP(**umap_params).fit(orig_data)
    embedding = model.transform(orig_data)

    # column labels are 1-based: UMAP1 .. UMAPn
    n_dims = embedding.shape[1]
    component_names = [f'UMAP{dim}' for dim in range(1, n_dims + 1)]

    return pd.DataFrame(
        data=embedding,
        index=orig_data.index.copy(),
        columns=component_names,
    )

In [11]:
# project the data once for each requested number of UMAP components
for n_comp in DR_OPTIONS['n_components']:
    print(f'# components: {n_comp}')

    # settings for this run: ALL_OPTIONS with a single n_components value
    options = {**ALL_OPTIONS, 'n_components': n_comp}

    dr_data = get_umap_proj(data, options)

    # sanity check: one row per input row, one column per component
    display(dr_data.shape)
    assert dr_data.shape == (data.shape[0], n_comp)

    # summary statistics of the first five components only
    display(dr_data.iloc[:, :5].describe())

    # save; the file name is generated from the options of this run
    output_file = (
        RESULTS_DIR
        / generate_result_set_name(
            options, prefix=f'umap-{input_filepath_stem}-', suffix='.pkl'
        )
    ).resolve()
    display(output_file)

    dr_data.to_pickle(output_file)

    print('\n')

# components: 5


(4091, 5)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,4091.0,4091.0,4091.0,4091.0,4091.0
mean,-4.237659,-0.75066,7.940552,18.266495,-1.793469
std,1.286076,0.890858,0.697343,1.594852,1.22031
min,-6.137083,-2.587179,1.094692,7.07212,-4.371616
25%,-4.859378,-1.190252,7.566836,17.972799,-2.416562
50%,-4.402143,-0.727043,7.956556,18.588083,-1.972375
75%,-3.900621,-0.378708,8.320815,19.048976,-1.466552
max,6.39168,14.667519,10.150544,20.937714,6.267581


PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap/umap-smultixcan-mashr-zscores-metric_euclidean-n_components_5-n_neighbors_15-random_state_0.pkl')



# components: 10


(4091, 10)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,4091.0,4091.0,4091.0,4091.0,4091.0
mean,10.330838,14.435884,-0.557826,-7.397826,8.639183
std,0.710966,1.096027,0.68881,1.224076,0.556405
min,4.408261,-1.590397,-1.801302,-8.801149,4.894973
25%,10.150208,14.223848,-0.898133,-7.919326,8.419196
50%,10.411121,14.645886,-0.658852,-7.664356,8.702322
75%,10.659261,14.956228,-0.313166,-7.267048,8.957866
max,11.924663,15.845276,8.66532,1.546034,10.047578


PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap/umap-smultixcan-mashr-zscores-metric_euclidean-n_components_10-n_neighbors_15-random_state_0.pkl')



# components: 20


(4091, 20)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,4091.0,4091.0,4091.0,4091.0,4091.0
mean,11.842308,6.747553,6.98209,12.935176,6.810058
std,0.795518,0.683655,0.533145,0.702544,0.318004
min,5.021392,-3.132912,0.579774,8.217664,5.667332
25%,11.73497,6.472368,6.823472,12.843904,6.608497
50%,11.977609,6.679616,6.964615,13.100359,6.789797
75%,12.177758,6.915837,7.131835,13.304968,7.006941
max,12.895435,10.347921,10.522081,13.946968,8.03584


PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap/umap-smultixcan-mashr-zscores-metric_euclidean-n_components_20-n_neighbors_15-random_state_0.pkl')



# components: 30


(4091, 30)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,4091.0,4091.0,4091.0,4091.0,4091.0
mean,11.622942,7.973876,7.436134,14.498184,6.950595
std,0.789581,0.561389,0.416959,0.953581,0.352187
min,4.623033,-2.416756,2.568733,7.579064,4.822701
25%,11.540437,7.795347,7.303968,14.43781,6.797884
50%,11.755882,7.956502,7.40351,14.70629,7.012434
75%,11.914007,8.140062,7.527861,14.887749,7.152408
max,12.754738,10.169818,10.160565,15.891395,7.788733


PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap/umap-smultixcan-mashr-zscores-metric_euclidean-n_components_30-n_neighbors_15-random_state_0.pkl')



# components: 40


(4091, 40)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,4091.0,4091.0,4091.0,4091.0,4091.0
mean,10.59211,9.236764,6.653019,14.376987,7.209499
std,0.653411,0.513261,0.552771,0.887772,0.32216
min,4.32515,-1.337941,1.015203,7.845274,5.240104
25%,10.538527,9.125125,6.43189,14.35982,7.109197
50%,10.644838,9.254277,6.580098,14.564935,7.258442
75%,10.763435,9.370437,6.776531,14.714285,7.392494
max,11.3732,10.276183,9.867143,15.522955,7.93973


PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap/umap-smultixcan-mashr-zscores-metric_euclidean-n_components_40-n_neighbors_15-random_state_0.pkl')



# components: 50


(4091, 50)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,4091.0,4091.0,4091.0,4091.0,4091.0
mean,10.637209,10.132711,9.082889,15.716043,6.231224
std,0.804302,0.586793,0.367086,1.067865,0.207709
min,3.028265,-2.351551,3.29264,8.376385,4.680161
25%,10.580744,10.046833,8.983984,15.671436,6.164224
50%,10.735858,10.158447,9.097907,16.002615,6.242078
75%,10.893124,10.280313,9.193085,16.183266,6.319928
max,11.434814,10.836115,10.090738,16.825451,6.763731


PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap/umap-smultixcan-mashr-zscores-metric_euclidean-n_components_50-n_neighbors_15-random_state_0.pkl')



