# Description

This notebook projects the input data into UMAP representations, one for each configured number of components.

# Modules loading

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from pathlib import Path
from IPython.display import display

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import conf
from utils import generate_result_set_name

# Settings

## Input data

In [None]:
# base name of the input data set; the actual input file name is built from
# it below as 'z_score_std-<INPUT_FILEPATH_STEM>.pkl'
INPUT_FILEPATH_STEM = 'projection-smultixcan-efo_partial-mashr-zscores'

In [None]:
# full path of the input file:
#   <DATA_TRANSFORMATIONS_DIR>/z_score_std/z_score_std-<INPUT_FILEPATH_STEM>.pkl
INPUT_FILEPATH = (
    Path(conf.RESULTS["DATA_TRANSFORMATIONS_DIR"])
    / 'z_score_std'
    / f'z_score_std-{INPUT_FILEPATH_STEM}.pkl'
).resolve()
display(INPUT_FILEPATH)

# file name without its extension; note that this normally carries the
# 'z_score_std-' prefix, so it is NOT the same value as INPUT_FILEPATH_STEM
input_filepath_stem = INPUT_FILEPATH.stem
display(input_filepath_stem)

## Output folder

In [None]:
# directory where this notebook writes its results; it is created (together
# with any missing parent directories) if it does not exist yet
RESULTS_DIR = (Path(conf.RESULTS["DATA_TRANSFORMATIONS_DIR"]) / 'umap').resolve()
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

display(RESULTS_DIR)

## UMAP options

In [None]:
# parameters of the dimensionality reduction step
# note that these are the default parameters of UMAP (metric and n_neighbors)
#
# 'n_components' is a list: one projection is computed for each value, and the
# list is replaced by a single value before the options are passed on.
# 'random_state' is presumably the seed used by UMAP -- TODO confirm in
# data.dimreduction.get_umap_proj
DR_OPTIONS = {
    'n_components': [5, 10, 20, 30, 40, 50],
    'metric': 'euclidean',
    'n_neighbors': 15,
    'random_state': 0,
}

In [None]:
# single dictionary with all options/settings (used to generate filenames);
# this is a shallow copy, so the 'n_components' list object is shared with
# DR_OPTIONS
ALL_OPTIONS = dict(DR_OPTIONS)

display(ALL_OPTIONS)

# Load input file

In [None]:
# read the pickled input data from INPUT_FILEPATH
# NOTE: unpickling can execute arbitrary code, so only load files from a
# trusted source
data = pd.read_pickle(INPUT_FILEPATH)

In [None]:
# sanity check: dimensions of the loaded data
display(data.shape)

In [None]:
# sanity check: first rows of the loaded data
display(data.head())

# UMAP

**Reproducibility problem**: UMAP appears to have a bug that makes it produce different results on different operating systems or machines; see https://github.com/lmcinnes/umap/issues/153

In [None]:
from data.dimreduction import get_umap_proj

In [None]:
# Compute, check and save one UMAP projection per requested number of
# components. DR_OPTIONS['n_components'] is a list, so in every iteration it
# is replaced by the single value being processed.
for n_comp in DR_OPTIONS['n_components']:
    print(f'# components: {n_comp}')

    # options for this run: all settings, with the current number of
    # components, restricted to the keys that are dimensionality reduction
    # options (i.e. present in DR_OPTIONS)
    merged_options = {**ALL_OPTIONS, 'n_components': n_comp}
    options = {
        key: merged_options[key]
        for key in merged_options
        if key in DR_OPTIONS
    }

    # project the input data
    dr_data = get_umap_proj(data, options)

    # the projection must keep the first dimension of the input and have
    # exactly n_comp as its second dimension
    display(dr_data.shape)
    expected_shape = (data.shape[0], n_comp)
    assert dr_data.shape == expected_shape

    # summary statistics of (at most) the first five components
    display(dr_data.iloc[:, :5].describe())

    # the output file name is derived from the options used in this run
    result_name = generate_result_set_name(
        options,
        prefix=f'umap-{input_filepath_stem}-',
        suffix='.pkl',
    )
    output_file = (RESULTS_DIR / result_name).resolve()
    display(output_file)

    # save the projection
    dr_data.to_pickle(output_file)

    print('\n')