# Description

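This notebook projects the input data into UMAP representations, one for each number of components listed in the settings below, and saves each projection to a file.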

# Modules loading

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
from IPython.display import display

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import conf
from utils import generate_result_set_name

# Settings

## Input data

In [3]:
# stem used to build the name of the input file read by this notebook
INPUT_FILEPATH_STEM = 'smultixcan-efo_partial-mashr-zscores'

In [4]:
# absolute path of the input file with the data to project
INPUT_FILEPATH = (
    Path(conf.RESULTS["DATA_TRANSFORMATIONS_DIR"])
    / 'z_score_std'
    / f'z_score_std-{INPUT_FILEPATH_STEM}.pkl'
).resolve()

# input file name without its extension; it becomes part of the output file names
input_filepath_stem = INPUT_FILEPATH.stem

display(INPUT_FILEPATH, input_filepath_stem)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/data_transformations/z_score_std/z_score_std-smultixcan-efo_partial-mashr-zscores.pkl')

'z_score_std-smultixcan-efo_partial-mashr-zscores'

## Output folder

In [5]:
# folder where this notebook saves its results; it is created (along with any
# missing parent folders) if it does not exist yet
RESULTS_DIR = (Path(conf.RESULTS["DATA_TRANSFORMATIONS_DIR"]) / 'umap').resolve()
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

display(RESULTS_DIR)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/data_transformations/umap')

## UMAP options

In [6]:
# parameters of the dimensionality reduction step
# note that metric and n_neighbors are set to the default values of UMAP
DR_OPTIONS = dict(
    # one projection is computed for each of these numbers of components
    n_components=[5, 10, 20, 30, 40, 50],
    metric='euclidean',
    n_neighbors=15,
    # fixed seed
    random_state=0,
)

In [7]:
# all options/settings gathered in a single dictionary (used to generate
# filenames); this is a shallow copy of DR_OPTIONS
ALL_OPTIONS = dict(DR_OPTIONS)

display(ALL_OPTIONS)

{'n_components': [5, 10, 20, 30, 40, 50],
 'metric': 'euclidean',
 'n_neighbors': 15,
 'random_state': 0}

# Load input file

In [8]:
# load the input data; unpickling can execute arbitrary code, so
# INPUT_FILEPATH must point to a trusted file
data = pd.read_pickle(INPUT_FILEPATH)

In [9]:
# (number of rows, number of columns) of the input data
display(data.shape)

(3749, 6452)

In [10]:
# preview the first rows of the input data
display(data.head())

gene_name,ENSG00000183087,ENSG00000157227,ENSG00000096696,ENSG00000175130,ENSG00000113140,ENSG00000117984,ENSG00000116016,ENSG00000129116,ENSG00000134686,ENSG00000108679,...,ENSG00000111716,ENSG00000166796,ENSG00000114331,ENSG00000131584,ENSG00000165410,ENSG00000172757,ENSG00000147862,ENSG00000008323,ENSG00000167083,ENSG00000149257
100001_raw-Food_weight,0.834672,-0.969366,-0.788966,-1.236243,-1.19291,-0.859681,-0.324518,0.880799,-0.152357,-1.109631,...,-0.847934,-1.246339,-1.225673,-0.270551,-0.251339,-0.348636,-0.283083,0.54104,0.412878,0.21218
100002_raw-Energy,-0.893351,-0.906648,-1.108427,-0.633443,-1.257057,-0.975325,0.100801,-0.814622,0.9937,0.709155,...,1.039015,-0.097355,0.613387,1.297104,-0.646984,-0.642464,0.254638,-0.235577,0.390156,0.87505
100003_raw-Protein,0.724878,0.367317,-1.100323,-1.304925,-0.735474,-1.178054,0.002718,-0.151997,0.730952,0.949561,...,0.668253,0.381954,0.883388,0.028914,-1.262583,-0.547308,-0.369882,0.725386,0.969339,0.582554
100004_raw-Fat,-0.273941,-1.113389,-1.176575,-0.461884,-0.947444,-0.213364,-0.169319,-1.092854,0.7522,1.207424,...,0.724525,-0.569374,0.590451,1.656459,-0.081229,-0.290437,0.9289,-0.661822,0.709848,0.600761
100005_raw-Carbohydrate,-0.925286,-0.186918,-0.488066,-0.682023,-0.746643,-0.262771,-0.261208,-0.513372,0.385347,-0.681057,...,0.692922,-0.029716,0.655211,1.326645,-0.897059,-0.569916,1.393707,-0.189605,-0.369507,0.257153


# UMAP

**Reproducibility problem**: there seems to be a bug in UMAP that makes it produce different results on different operating systems or machines: https://github.com/lmcinnes/umap/issues/153

In [11]:
from data.dimreduction import get_umap_proj

In [12]:
# Compute, check and save one UMAP projection per requested number of components
for n_comp in DR_OPTIONS['n_components']:
    print(f'# components: {n_comp}')

    # options for this run: ALL_OPTIONS with a single n_components value,
    # keeping only the keys that are defined in DR_OPTIONS
    options = {
        key: value
        for key, value in {**ALL_OPTIONS, 'n_components': n_comp}.items()
        if key in DR_OPTIONS
    }

    # project the input data
    dr_data = get_umap_proj(data, options)

    # the projection must have one row per input row and n_comp columns
    display(dr_data.shape)
    assert dr_data.shape == (data.shape[0], n_comp)

    # summary statistics of (at most) the first five components
    display(dr_data.iloc[:, :5].describe())

    # the output file name is generated from the options of this run
    output_filename = generate_result_set_name(
        options,
        prefix=f'umap-{input_filepath_stem}-',
        suffix='.pkl',
    )
    output_file = Path(RESULTS_DIR, output_filename).resolve()
    display(output_file)

    # save
    dr_data.to_pickle(output_file)

    print('\n')

# components: 5


'UMAP object: UMAP(n_components=5, random_state=0)'

(3749, 5)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,3749.0,3749.0,3749.0,3749.0,3749.0
mean,9.543938,10.16572,9.601528,11.654917,10.149116
std,0.826214,0.69455,0.751955,1.428988,1.05157
min,3.332259,-0.163783,7.697813,3.638116,2.022543
25%,9.243148,9.866829,9.157655,11.503939,9.784466
50%,9.561017,10.161626,9.663405,12.009958,10.300212
75%,9.975659,10.535581,10.080146,12.449842,10.727874
max,11.43644,11.680159,12.149063,13.714686,12.997172


PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/data_transformations/umap/umap-z_score_std-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_5-n_neighbors_15-random_state_0.pkl')



# components: 10


'UMAP object: UMAP(n_components=10, random_state=0)'

(3749, 10)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,3749.0,3749.0,3749.0,3749.0,3749.0
mean,10.055091,9.345581,1.972606,-0.810811,7.834863
std,0.759268,0.685186,0.393612,1.09404,0.664199
min,3.38035,-1.548761,0.423826,-2.543923,3.816696
25%,9.90875,9.03159,1.68795,-1.356854,7.586656
50%,10.132289,9.320099,1.986853,-1.059108,7.969265
75%,10.343075,9.655686,2.255163,-0.661593,8.226859
max,11.093286,11.379278,3.222786,6.470329,8.994748


PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/data_transformations/umap/umap-z_score_std-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_10-n_neighbors_15-random_state_0.pkl')



# components: 20


'UMAP object: UMAP(n_components=20, random_state=0)'

(3749, 20)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,3749.0,3749.0,3749.0,3749.0,3749.0
mean,12.019559,8.970558,10.794836,9.053912,7.513159
std,1.021337,0.501,0.418081,0.694764,0.545597
min,4.233697,0.318696,8.705801,4.026264,3.586336
25%,12.063798,8.796188,10.629162,8.90891,7.370019
50%,12.291521,8.970154,10.872267,9.154427,7.598104
75%,12.46997,9.19138,11.071811,9.390047,7.792161
max,13.153984,9.990442,11.732347,10.035084,8.466881


PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/data_transformations/umap/umap-z_score_std-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_20-n_neighbors_15-random_state_0.pkl')



# components: 30


'UMAP object: UMAP(n_components=30, random_state=0)'

(3749, 30)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,3749.0,3749.0,3749.0,3749.0,3749.0
mean,11.545959,10.84359,12.209373,9.651301,6.104931
std,1.023025,0.649284,0.502069,0.761134,0.265185
min,2.865891,-1.496077,8.970568,4.550162,4.628211
25%,11.548882,10.715927,12.08301,9.622404,5.966662
50%,11.773587,10.914841,12.273912,9.8849,6.13026
75%,11.946756,11.082951,12.453206,10.03687,6.274264
max,12.79746,11.672678,13.643174,10.597328,6.692077


PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/data_transformations/umap/umap-z_score_std-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_30-n_neighbors_15-random_state_0.pkl')



# components: 40


'UMAP object: UMAP(n_components=40, random_state=0)'

(3749, 40)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,3749.0,3749.0,3749.0,3749.0,3749.0
mean,12.817765,10.235202,11.222728,9.841418,6.867486
std,1.106764,0.576721,0.323367,0.769608,0.404567
min,4.066474,-1.447158,9.343712,4.563159,3.598579
25%,12.890082,10.130205,11.138888,9.822404,6.769755
50%,13.115295,10.27812,11.294712,10.043162,6.927192
75%,13.267161,10.4019,11.401079,10.202492,7.068208
max,13.704994,10.877576,11.885241,10.712289,7.504284


PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/data_transformations/umap/umap-z_score_std-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_40-n_neighbors_15-random_state_0.pkl')



# components: 50


'UMAP object: UMAP(n_components=50, random_state=0)'

(3749, 50)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,3749.0,3749.0,3749.0,3749.0,3749.0
mean,12.265738,10.170321,12.591548,9.513645,7.722247
std,0.972277,0.528322,0.646648,0.71511,0.56267
min,4.395242,-0.456861,8.75156,3.970074,3.659067
25%,12.336788,10.060452,12.609579,9.455735,7.656511
50%,12.501959,10.200792,12.806866,9.676305,7.84781
75%,12.641078,10.319852,12.907908,9.844992,7.977063
max,13.250747,10.810098,13.340292,10.55801,8.693202


PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/data_transformations/umap/umap-z_score_std-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_50-n_neighbors_15-random_state_0.pkl')



