# Description

This notebook reads the input projection matrix, embeds it with UMAP using several values of `n_components`, and saves one pickle file per setting.

# Modules loading

In [1]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before running each cell, so edits to local modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
from IPython.display import display

import pandas as pd
import umap
import matplotlib.pyplot as plt
import seaborn as sns

import conf
from utils import generate_result_set_name

# Settings

In [3]:
# Pickle file holding the projection that will be embedded with UMAP.
INPUT_FILEPATH = (
    Path(conf.RESULTS['PROJECTIONS_DIR'])
    / 'projection-smultixcan-efo_partial-mashr-zscores.pkl'
).resolve()
display(INPUT_FILEPATH)

# File name without its extension; reused below as part of the output names.
input_filepath_stem = INPUT_FILEPATH.stem
display(input_filepath_stem)

PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/projections/projection-smultixcan-efo_partial-mashr-zscores.pkl')

'projection-smultixcan-efo_partial-mashr-zscores'

In [4]:
# Settings for the dimensionality reduction step. Keys present here are the
# ones forwarded to umap.UMAP by get_umap_proj.
DR_OPTIONS = dict(
    # candidate embedding sizes; one projection is generated per value
    n_components=[5, 10, 20, 30, 40, 50],
    # remaining values are passed to umap.UMAP unchanged
    metric='euclidean',
    n_neighbors=15,
    # fixed seed so reruns use the same random state
    random_state=0,
)

In [5]:
# Directory where this notebook writes its UMAP projections; created on
# first use (including any missing parents).
RESULTS_DIR = (Path(conf.RESULTS["DATA_TRANSFORMATIONS_DIR"]) / 'umap').resolve()
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

display(RESULTS_DIR)

PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap')

In [6]:
# Every option/setting of this run, gathered in one dictionary that is later
# used to generate the output file names. This is a shallow copy, so the
# 'n_components' list object is shared with DR_OPTIONS.
ALL_OPTIONS = dict(DR_OPTIONS)

display(ALL_OPTIONS)

{'n_components': [5, 10, 20, 30, 40, 50],
 'metric': 'euclidean',
 'n_neighbors': 15,
 'random_state': 0}

# Load input file

In [7]:
# Load the pickled projection and transpose it, so that the LV* labels end up
# as columns (see the head() output below).
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
data = pd.read_pickle(INPUT_FILEPATH).T

In [8]:
# (number of rows, number of columns) of the transposed input
data.shape

(3749, 987)

In [9]:
# Preview the first rows to check the row labels and the LV* columns
data.head()

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,...,LV978,LV979,LV980,LV981,LV982,LV983,LV984,LV985,LV986,LV987
100001_raw-Food_weight,-0.018452,0.052938,-0.003629,0.028359,-0.0155,0.035854,-0.015439,0.023007,0.017368,0.026811,...,0.03356,0.047126,-0.036183,0.06875,0.023462,-0.030111,0.011272,-0.017171,0.016078,-0.022283
100002_raw-Energy,-0.043782,-0.012041,-0.011772,-0.006148,0.007011,0.018142,0.003144,0.018049,0.006926,0.038587,...,0.004833,0.022842,-0.009519,-0.000258,0.059764,-0.028394,-0.005967,0.045269,-0.007684,-0.01891
100003_raw-Protein,-0.021514,-0.028537,0.009441,0.007808,0.012707,0.021681,-0.006315,0.016129,7.6e-05,-0.001702,...,0.029704,0.029135,-0.056508,-0.002032,0.001189,-0.025507,-0.013012,0.037458,-0.009592,-0.016718
100004_raw-Fat,-0.030454,-0.052542,0.000459,-0.039613,0.006191,0.029523,0.000747,0.011876,-0.025758,0.025099,...,0.0159,0.016482,0.007409,-0.006833,0.036457,-0.034531,0.015365,0.023796,-0.017477,-0.005397
100005_raw-Carbohydrate,-0.017428,0.003757,-0.003708,-0.000929,-0.000647,-0.005729,0.02497,0.011531,0.035043,0.025159,...,-0.010071,0.002266,0.006664,0.00738,0.02994,-0.006989,0.014807,0.050208,0.005352,-0.049218


# UMAP

In [10]:
def get_umap_proj(orig_data, options):
    """Embed `orig_data` with UMAP and return the result as a DataFrame.

    Parameters
    ----------
    orig_data : pandas.DataFrame
        Data to embed; its index is copied onto the returned frame.
    options : dict
        Settings for this run. Only the keys that also appear in the
        module-level DR_OPTIONS are forwarded to ``umap.UMAP``; any other
        key is ignored.

    Returns
    -------
    pandas.DataFrame
        One row per row of `orig_data` and one column per embedding
        dimension, named 'UMAP1', 'UMAP2', ...
    """
    umap_kwargs = {
        name: value
        for name, value in options.items()
        if name in DR_OPTIONS
    }

    # The model is fitted and then the same data is passed to transform().
    # NOTE(review): fit_transform() may be an equivalent single step here;
    # confirm it yields the same coordinates before switching.
    reducer = umap.UMAP(**umap_kwargs).fit(orig_data)
    embedding = reducer.transform(orig_data)

    n_dims = embedding.shape[1]
    component_names = [f'UMAP{idx}' for idx in range(1, n_dims + 1)]

    return pd.DataFrame(
        data=embedding,
        index=orig_data.index.copy(),
        columns=component_names,
    )

In [11]:
# Generate one UMAP projection per requested number of components and save
# each of them as a pickle file under RESULTS_DIR.
for n_comp in DR_OPTIONS['n_components']:
    print(f'# components: {n_comp}')

    # settings for this run: the shared options, with the list of candidate
    # sizes replaced by the single value being processed
    options = {**ALL_OPTIONS, 'n_components': n_comp}

    dr_data = get_umap_proj(data, options)

    # sanity check: same number of rows as the input, n_comp columns
    display(dr_data.shape)
    assert dr_data.shape == (len(data), n_comp)

    # summary statistics of (at most) the first five components
    display(dr_data.iloc[:, :5].describe())

    # save; the file name encodes the input file stem and the run options
    output_file = RESULTS_DIR.joinpath(
        generate_result_set_name(
            options,
            prefix=f'umap-{input_filepath_stem}-',
            suffix='.pkl',
        )
    ).resolve()
    display(output_file)

    dr_data.to_pickle(output_file)

    print('\n')

# components: 5


(3749, 5)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,3749.0,3749.0,3749.0,3749.0,3749.0
mean,3.996364,-4.57915,-2.381185,12.718588,15.357991
std,0.82869,1.125089,0.73425,1.221331,0.908778
min,2.210563,-5.98532,-4.135514,5.086887,8.19432
25%,3.442145,-5.132524,-2.808691,12.422697,15.048211
50%,3.875299,-4.676681,-2.372222,12.952255,15.438498
75%,4.38828,-4.222258,-2.054847,13.364546,15.816606
max,8.131745,12.219553,4.790262,14.530914,17.804277


PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap/umap-projection-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_5-n_neighbors_15-random_state_0.pkl')



# components: 10


(3749, 10)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,3749.0,3749.0,3749.0,3749.0,3749.0
mean,11.846665,2.969445,-3.858104,1.500653,-4.276917
std,0.583781,0.635233,0.773101,0.54596,0.839233
min,7.827236,0.219349,-5.09868,0.102483,-5.6157
25%,11.582856,2.713638,-4.268599,1.184582,-4.645813
50%,11.881741,2.997579,-3.956902,1.480065,-4.41927
75%,12.160008,3.278105,-3.585745,1.744241,-4.092615
max,13.151178,11.667685,4.690622,5.284763,2.517114


PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap/umap-projection-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_10-n_neighbors_15-random_state_0.pkl')



# components: 20


(3749, 20)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,3749.0,3749.0,3749.0,3749.0,3749.0
mean,14.314087,10.125624,10.845999,4.331203,14.293452
std,0.832876,0.618109,0.523132,0.268324,0.60836
min,8.263475,-2.044916,4.283272,3.477882,9.230556
25%,14.247963,9.994162,10.68425,4.15835,14.186441
50%,14.468014,10.166414,10.921513,4.338503,14.359128
75%,14.65733,10.332847,11.112129,4.515208,14.5232
max,15.363819,10.943927,11.727321,5.537631,15.165653


PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap/umap-projection-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_20-n_neighbors_15-random_state_0.pkl')



# components: 30


(3749, 30)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,3749.0,3749.0,3749.0,3749.0,3749.0
mean,13.344429,10.320044,8.814381,8.12171,12.930243
std,0.760608,0.57375,0.344378,0.397372,0.536628
min,7.442815,-1.166084,4.930897,5.297506,8.356246
25%,13.298338,10.205273,8.655258,7.994777,12.856535
50%,13.479302,10.340881,8.795181,8.165555,12.985718
75%,13.620317,10.490021,8.951226,8.335352,13.112096
max,14.345589,10.957788,10.327756,9.123297,13.603683


PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap/umap-projection-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_30-n_neighbors_15-random_state_0.pkl')



# components: 40


(3749, 40)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,3749.0,3749.0,3749.0,3749.0,3749.0
mean,12.184085,7.44232,13.120358,8.30372,15.466315
std,0.596015,0.512163,0.538274,0.412523,0.758163
min,7.573236,-0.807962,7.162447,5.469018,7.731785
25%,12.112029,7.252896,13.070852,8.217155,15.435895
50%,12.28089,7.405071,13.246187,8.377634,15.609369
75%,12.412335,7.562565,13.354837,8.507201,15.740123
max,13.106312,9.429248,13.709147,9.210105,16.218624


PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap/umap-projection-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_40-n_neighbors_15-random_state_0.pkl')



# components: 50


(3749, 50)

Unnamed: 0,UMAP1,UMAP2,UMAP3,UMAP4,UMAP5
count,3749.0,3749.0,3749.0,3749.0,3749.0
mean,12.190821,9.693271,8.82104,5.151597,13.368401
std,0.574257,0.511677,0.297742,0.204645,0.511793
min,7.259162,-0.741623,4.879749,4.516759,8.868875
25%,12.11676,9.599879,8.706084,5.007544,13.277666
50%,12.264053,9.706082,8.825241,5.154793,13.428452
75%,12.401225,9.816405,8.944929,5.299308,13.564143
max,13.152287,10.181146,10.434216,5.804502,14.315068


PosixPath('/media/miltondp/Elements1/projects/phenoplier/results/data_transformations/umap/umap-projection-smultixcan-efo_partial-mashr-zscores-metric_euclidean-n_components_50-n_neighbors_15-random_state_0.pkl')



