# Description

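This notebook reads the projection stored in `projection-smultixcan-mashr-zscores.pkl` and computes UMAP representations of it, one for each configured number of components. Each representation is saved as a pickle file in the results directory.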

# Modules loading

In [None]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
from IPython.display import display

import pandas as pd
import umap
import matplotlib.pyplot as plt
import seaborn as sns

import conf
from utils import generate_result_set_name

# Settings

In [3]:
# pickled projection that this notebook reduces with UMAP
INPUT_FILEPATH = (
    Path(conf.RESULTS['PROJECTIONS_DIR']) / 'projection-smultixcan-mashr-zscores.pkl'
).resolve()
display(INPUT_FILEPATH)

# file name without its extension; reused below as part of the output file names
input_filepath_stem = INPUT_FILEPATH.stem
display(input_filepath_stem)

PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/projections/projection-smultixcan-mashr-zscores.pkl')

'projection-smultixcan-mashr-zscores'

In [4]:
# UMAP settings: each value listed under 'n_components' produces its own
# embedding; the remaining entries are shared by all runs
DR_OPTIONS = dict(
    n_components=[5, 10, 20, 30, 40, 50],
    metric='euclidean',
    n_neighbors=15,
    random_state=0,
)

In [5]:
# directory where this notebook stores its results (created if it is missing)
RESULTS_DIR = (Path(conf.RESULTS["DATA_TRANSFORMATIONS_DIR"]) / 'umap').resolve()
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

display(RESULTS_DIR)

PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap')

In [6]:
# all options/settings of this run (used to generate the output file names);
# this is a shallow copy, so the 'n_components' list is shared with DR_OPTIONS
ALL_OPTIONS = dict(DR_OPTIONS)

display(ALL_OPTIONS)

{'n_components': [5, 10, 20, 30, 40, 50],
 'metric': 'euclidean',
 'n_neighbors': 15,
 'random_state': 0}

# Load input file

In [7]:
# Load the pickled projection and transpose it. After the transpose the columns
# are LV1..LV987 and the rows appear to be indexed by trait/phenotype
# identifiers (see the preview printed below).
# NOTE(review): unpickling can execute arbitrary code, so INPUT_FILEPATH must
# point to a trusted, project-generated file.
data = pd.read_pickle(INPUT_FILEPATH).T

In [8]:
data.shape

(4091, 987)

In [9]:
data.head()

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,...,LV978,LV979,LV980,LV981,LV982,LV983,LV984,LV985,LV986,LV987
20096_1-Size_of_red_wine_glass_drunk_small_125ml,-0.026255,-0.006253,-0.002729,0.042319,-0.025975,0.022122,0.004249,-0.015139,-0.001912,-0.021863,...,-0.024305,-0.021982,-0.020533,-0.001901,-0.019043,0.006065,-0.042334,-0.025743,0.003379,0.003053
2345-Ever_had_bowel_cancer_screening,0.006045,-0.023996,0.051637,-0.023236,0.002167,-0.009903,-0.036104,0.008751,0.004731,0.035955,...,0.030414,0.015316,0.023236,0.04067,0.002873,0.007549,0.043547,-0.037085,0.043986,0.00123
N49-Diagnoses_main_ICD10_N49_Inflammatory_disorders_of_male_genital_organs_not_elsewhere_classified,0.007276,0.037932,-0.013182,0.063016,-0.022646,0.013733,0.022293,-0.020302,-0.005819,-0.026274,...,0.002066,-0.006492,-0.003635,0.017402,-0.034985,0.032009,-0.009513,0.020852,0.005649,-0.00351
100011_raw-Iron,-0.02974,-1.8e-05,-0.001663,-0.002628,0.004559,0.026327,0.000334,0.05785,-0.011335,0.036659,...,0.017457,0.026494,-0.018653,0.016464,0.013154,-0.022661,0.013973,0.033139,-0.049358,-0.030986
5221-Index_of_best_refractometry_result_right,-0.001949,-0.016128,0.035221,-0.046487,0.004649,-0.032353,0.002471,0.006538,0.001404,0.039243,...,0.038107,-0.005864,-0.011039,0.013418,-0.010531,-0.026655,-0.020884,-0.024993,0.011461,0.016251


# UMAP

In [10]:
def get_umap_proj(orig_data, options):
    """
    Fit UMAP on the given data and return the embedding of that same data.

    Args:
        orig_data: data to embed; it is passed as-is to UMAP and must expose
            a pandas-style ``.index``, which is copied to the result.
        options: dict of settings. Only the keys that also appear in the
            notebook-level DR_OPTIONS are forwarded to ``umap.UMAP``; any
            other key is ignored.

    Returns:
        A pandas DataFrame with a copy of ``orig_data``'s index and one column
        per embedding dimension, named UMAP1, UMAP2, ...
    """
    # keep only the settings that UMAP is meant to receive
    umap_kwargs = {}
    for name, value in options.items():
        if name in DR_OPTIONS:
            umap_kwargs[name] = value

    # fit first, then transform the same data with the fitted model
    reducer = umap.UMAP(**umap_kwargs).fit(orig_data)
    embedding = reducer.transform(orig_data)

    n_dims = embedding.shape[1]
    column_names = [f'UMAP{dim}' for dim in range(1, n_dims + 1)]

    return pd.DataFrame(
        data=embedding,
        index=orig_data.index.copy(),
        columns=column_names,
    )

In [11]:
# Run UMAP once per requested number of components and pickle each embedding.
for n_comp in DR_OPTIONS['n_components']:
    print(f'# components: {n_comp}')

    # settings of this run: ALL_OPTIONS with a single value for n_components
    options = {**ALL_OPTIONS, 'n_components': n_comp}

    dr_data = get_umap_proj(data, options)

    # the embedding must keep every input row and have n_comp columns
    display(dr_data.shape)
    assert dr_data.shape == (data.shape[0], n_comp)

    # summary statistics of the first five components only
    display(dr_data.iloc[:, 0:5].describe())

    # save; the file name encodes the input file and the options of this run
    output_file = (
        RESULTS_DIR
        / generate_result_set_name(
            options,
            prefix=f'umap-{input_filepath_stem}-',
            suffix='.pkl',
        )
    ).resolve()
    display(output_file)

    dr_data.to_pickle(output_file)

    print('\n')

# components: 5


(4091, 5)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,4091.0,4091.0,4091.0,4091.0,4091.0
mean,15.092049,-0.340451,-4.964131,14.045987,14.192043
std,1.31654,0.76926,1.012424,1.112881,0.753357
min,4.432586,-2.033457,-7.016301,5.509353,9.877634
25%,14.813584,-0.757855,-5.511229,13.744675,13.829367
50%,15.27492,-0.367994,-5.082349,14.125525,14.244704
75%,15.698419,0.025582,-4.611725,14.593624,14.684642
max,17.144167,11.622155,5.723032,15.972184,16.314999


PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap/umap-projection-smultixcan-mashr-zscores-metric_euclidean-n_components_5-n_neighbors_15-random_state_0.pkl')



# components: 10


(4091, 10)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,4091.0,4091.0,4091.0,4091.0,4091.0
mean,14.114046,-1.962394,-2.638373,6.341091,2.594998
std,1.100211,0.730504,0.613221,0.42021,0.515559
min,4.675219,-3.127305,-3.985117,4.532813,-0.371966
25%,13.946414,-2.27897,-2.929388,6.054004,2.376227
50%,14.251517,-1.990373,-2.673989,6.329113,2.66652
75%,14.516145,-1.701694,-2.435909,6.63899,2.924531
max,15.790355,11.241449,5.536536,7.376374,3.783902


PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap/umap-projection-smultixcan-mashr-zscores-metric_euclidean-n_components_10-n_neighbors_15-random_state_0.pkl')



# components: 20


(4091, 20)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,4091.0,4091.0,4091.0,4091.0,4091.0
mean,12.379113,11.666807,10.714803,4.733529,8.373384
std,0.745161,0.680411,0.452666,0.285059,0.299464
min,5.762298,-1.508844,4.444713,3.996361,7.587744
25%,12.261736,11.546385,10.560979,4.547041,8.172793
50%,12.446454,11.737701,10.738589,4.738097,8.341006
75%,12.63848,11.89872,10.931459,4.92219,8.528892
max,13.892657,12.458995,11.64579,5.693902,10.024713


PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap/umap-projection-smultixcan-mashr-zscores-metric_euclidean-n_components_20-n_neighbors_15-random_state_0.pkl')



# components: 30


(4091, 30)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,4091.0,4091.0,4091.0,4091.0,4091.0
mean,13.796386,11.985489,10.461631,6.011446,9.567186
std,0.970103,0.640118,0.515074,0.274747,0.214855
min,4.792209,-0.826455,2.43502,4.366706,8.867124
25%,13.736948,11.888051,10.368342,5.831102,9.439028
50%,13.922606,12.017317,10.538025,6.019228,9.563671
75%,14.086956,12.157341,10.667516,6.209161,9.695908
max,14.56008,13.214928,11.051181,7.344556,10.415008


PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap/umap-projection-smultixcan-mashr-zscores-metric_euclidean-n_components_30-n_neighbors_15-random_state_0.pkl')



# components: 40


(4091, 40)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,4091.0,4091.0,4091.0,4091.0,4091.0
mean,14.027572,10.882144,10.095325,6.049903,6.003493
std,0.914546,0.597263,0.466466,0.201895,0.518032
min,5.551249,-1.270937,2.817276,5.125857,5.263468
25%,14.038877,10.794255,9.986335,5.950322,5.760055
50%,14.157476,10.929843,10.144741,6.075797,5.899253
75%,14.262088,11.0623,10.282416,6.182686,6.067422
max,14.741997,11.541875,10.889355,6.557185,10.079408


PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap/umap-projection-smultixcan-mashr-zscores-metric_euclidean-n_components_40-n_neighbors_15-random_state_0.pkl')



# components: 50


(4091, 50)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,4091.0,4091.0,4091.0,4091.0,4091.0
mean,13.444895,8.559925,9.117065,5.899381,10.957607
std,0.915052,0.525422,0.41282,0.196807,0.252918
min,5.24957,-2.049825,2.373647,4.773959,9.093416
25%,13.425783,8.437731,9.00782,5.793877,10.841897
50%,13.569436,8.569058,9.131269,5.894562,10.977713
75%,13.717456,8.696826,9.250693,6.014483,11.110442
max,14.121567,9.90784,11.026827,6.498401,11.667341


PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap/umap-projection-smultixcan-mashr-zscores-metric_euclidean-n_components_50-n_neighbors_15-random_state_0.pkl')



