# Description

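This notebook standardizes (z-scores) the features of an input file: each column is rescaled to zero mean and unit variance. The features can be latent variables or, as for the input file used here, genes.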

# Modules loading

In [1]:
# Enable IPython's autoreload extension so that edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload modules automatically before each cell is executed.
%autoreload 2

In [2]:
from pathlib import Path
from IPython.display import display

import numpy as np
import pandas as pd
from sklearn.preprocessing import scale

import conf

# Settings

In [3]:
# Stem identifying the input data set; it is interpolated into the input
# file name ('diseases_only-<stem>.pkl') when INPUT_FILEPATH is built.
INPUT_FILEPATH_STEM = 'smultixcan-efo_partial-mashr-zscores'

In [4]:
# Pickled input: <DATA_TRANSFORMATIONS_DIR>/traits_selections/diseases_only-<stem>.pkl
INPUT_FILEPATH = (
    Path(conf.RESULTS["DATA_TRANSFORMATIONS_DIR"])
    .joinpath('traits_selections', f'diseases_only-{INPUT_FILEPATH_STEM}.pkl')
    .resolve()
)
display(INPUT_FILEPATH)

# File name without its extension; reused later to name this notebook's output.
input_filepath_stem = INPUT_FILEPATH.stem
display(input_filepath_stem)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/data_transformations/traits_selections/diseases_only-smultixcan-efo_partial-mashr-zscores.pkl')

'diseases_only-smultixcan-efo_partial-mashr-zscores'

In [5]:
# output dir for this notebook
RESULTS_DIR = Path(
    conf.RESULTS["DATA_TRANSFORMATIONS_DIR"],
    'z_score_std'
).resolve()
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

display(RESULTS_DIR)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/data_transformations/z_score_std')

# Load input file

In [6]:
# Load the pickled object stored at INPUT_FILEPATH (used as a DataFrame below).
# NOTE(review): unpickling can execute arbitrary code; presumably this file is
# produced by an earlier step of this project -- confirm before loading files
# from any other source.
data = pd.read_pickle(INPUT_FILEPATH)

In [7]:
# (number of rows, number of columns) of the loaded data
data.shape

(538, 22515)

In [8]:
# Preview the first rows to check the row labels and column names.
data.head()

gene_name,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036,ENSG00000001084,ENSG00000001167,ENSG00000001460,ENSG00000001461,...,ENSG00000284240,ENSG00000284308,ENSG00000284395,ENSG00000284413,ENSG00000284418,ENSG00000284430,ENSG00000284452,ENSG00000284513,ENSG00000284526,ENSG00000284552
vascular disease AND intestinal disease,1.463849,0.295149,0.862394,0.64243,2.560404,0.276682,1.25269,0.138074,0.253713,0.13149,...,0.238496,1.582408,0.470057,1.212982,1.320907,1.348926,0.925907,0.648023,3.172445,0.658885
"osteoarthritis, knee",1.139573,0.339193,0.063113,0.673883,1.480963,0.029437,0.738344,0.597559,0.310168,0.262036,...,0.144221,0.558024,1.497799,3.391703,0.745621,0.681965,0.577402,0.41246,0.24563,1.40189
carpal tunnel syndrome,0.91841,0.592007,0.221104,1.875068,0.427498,1.564119,1.739466,0.369079,0.795678,0.811066,...,0.326533,1.521576,0.307704,0.155117,1.459384,0.813065,0.526617,0.618006,0.384546,1.114975
gastritis,1.616585,0.675119,0.847838,0.976489,0.275587,0.13709,0.246508,1.27692,0.36882,0.205764,...,2.220743,0.934335,0.858603,0.42243,0.221982,0.083101,0.909691,0.539585,1.158575,0.101605
neoplasm,1.445983,2.245419,1.185771,1.236719,0.298348,0.987865,0.670254,0.563399,1.70164,1.710727,...,1.933531,0.586532,0.324582,0.557693,0.986534,1.205885,1.061329,0.743238,0.591596,0.730569


# Data preprocessing for clustering

In [9]:
# Summary statistics of the raw values, restricted to the first 10 columns to
# keep the table readable; used to compare against the standardized values.
data_stats = data.iloc[:, :10].describe()
display(data_stats)

gene_name,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036,ENSG00000001084,ENSG00000001167,ENSG00000001460,ENSG00000001461
count,538.0,538.0,538.0,538.0,538.0,538.0,538.0,538.0,538.0,538.0
mean,0.876279,0.949519,0.879537,0.990397,0.952745,0.941189,0.902247,0.854361,0.882128,0.917427
std,0.614019,0.734327,0.700223,0.727093,0.683013,0.664107,0.635629,0.613803,0.686745,0.696687
min,0.005327,0.000509,0.00014,0.006677,2e-06,0.00955,0.00154,0.001177,0.002978,0.002426
25%,0.416252,0.393362,0.359201,0.423119,0.429098,0.393187,0.451304,0.362337,0.347634,0.362717
50%,0.769703,0.768482,0.739565,0.829347,0.822457,0.837543,0.772877,0.751141,0.7153,0.754837
75%,1.228141,1.354872,1.189649,1.393648,1.359572,1.364613,1.246128,1.255927,1.224353,1.291462
max,3.438081,4.628054,5.508381,3.753905,3.274207,3.57449,4.155087,3.7304,3.592585,3.951277


## Standardize

In [10]:
# Standardize the values with sklearn's `scale` and wrap the result in a new
# DataFrame that carries copies of the original row and column labels, so
# `data` itself is left untouched.
scaled_data = pd.DataFrame(
    scale(data),
    index=data.index.copy(),
    columns=data.columns.copy(),
)

In [11]:
# Standardization must not change the dimensions of the data.
scaled_data.shape

(538, 22515)

In [12]:
# Preview the standardized values; row and column labels match the input.
scaled_data.head()

gene_name,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036,ENSG00000001084,ENSG00000001167,ENSG00000001460,ENSG00000001461,...,ENSG00000284240,ENSG00000284308,ENSG00000284395,ENSG00000284413,ENSG00000284418,ENSG00000284430,ENSG00000284452,ENSG00000284513,ENSG00000284526,ENSG00000284552
vascular disease AND intestinal disease,0.957815,-0.891945,-0.024504,-0.479018,2.355967,-1.001533,0.551846,-1.168051,-0.915916,-1.129156,...,-1.08469,1.137969,-0.681588,0.417068,0.756573,0.925027,0.087152,-0.361709,3.831066,-0.383482
"osteoarthritis, knee",0.429203,-0.831911,-1.167032,-0.435719,0.774084,-1.374176,-0.2581,-0.418768,-0.833632,-0.9416,...,-1.230954,-0.536561,0.750267,3.640816,-0.193611,-0.211389,-0.484246,-0.760691,-0.98461,0.793113
carpal tunnel syndrome,0.068678,-0.48731,-0.941193,1.217856,-0.76973,0.938871,1.318377,-0.791351,-0.126,-0.152809,...,-0.948103,1.038529,-0.907778,-1.148204,0.985292,0.011989,-0.567512,-0.41255,-0.756042,0.338765
gastritis,1.206793,-0.374024,-0.045312,-0.019145,-0.99235,-1.211925,-1.032599,0.68907,-0.748146,-1.022447,...,1.990701,0.078584,-0.140263,-0.752674,-1.058492,-1.231776,0.060565,-0.545375,0.517516,-1.26597
neoplasm,0.92869,1.766387,0.437744,0.339092,-0.958995,0.07035,-0.365322,-0.474472,1.19444,1.139735,...,1.545101,-0.489959,-0.884264,-0.552531,0.204299,0.681304,0.309185,-0.20044,-0.41537,-0.269966


In [13]:
# Summary statistics of the standardized values for the first 10 columns.
# Note: `describe()` reports the sample standard deviation (pandas default
# ddof=1), so values slightly above 1 are expected here.
scaled_data_stats = scaled_data.iloc[:,:10].describe()
display(scaled_data_stats)

gene_name,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036,ENSG00000001084,ENSG00000001167,ENSG00000001460,ENSG00000001461
count,538.0,538.0,538.0,538.0,538.0,538.0,538.0,538.0,538.0,538.0
mean,-1.09619e-15,-1.565043e-15,-4.62249e-16,-4.62249e-17,-1.525422e-15,-5.282846e-16,5.745095e-16,-3.037636e-16,-3.367814e-16,-7.594091e-17
std,1.000931,1.000931,1.000931,1.000931,1.000931,1.000931,1.000931,1.000931,1.000931,1.000931
min,-1.419765,-1.293557,-1.257049,-1.354207,-1.39621,-1.404151,-1.418352,-1.391291,-1.281362,-1.314582
25%,-0.7499039,-0.758076,-0.7437905,-0.7809255,-0.7673854,-0.8259386,-0.7101051,-0.802346,-0.7790248,-0.7969525
50%,-0.1737322,-0.2467648,-0.2000814,-0.2217044,-0.1909317,-0.1562122,-0.2037206,-0.1683214,-0.2431525,-0.2335925
75%,0.5735793,0.55252,0.443288,0.5551237,0.5961907,0.6381776,0.5415137,0.654835,0.4987937,0.5373771
max,4.176067,5.01406,6.616677,3.804299,3.40202,3.968868,5.12228,4.68997,3.950493,4.358734


## Testing

In [14]:
# Every summarized column must be centered at zero (default np.isclose tolerances).
assert np.allclose(scaled_data_stats.loc['mean'].values, 0.0)

In [15]:
# `scale` normalizes with the population standard deviation (ddof=0), whereas
# `describe()` reports the sample one (ddof=1), i.e. sqrt(n / (n - 1)) rather
# than 1. Comparing that value against 1.0 only passes when there are enough
# rows for the gap to fit inside the tolerance, so the population standard
# deviation of the same (summarized) columns is checked instead.
assert np.allclose(
    scaled_data.iloc[:, :scaled_data_stats.shape[1]].std(ddof=0).values,
    1.0,
    atol=1e-03,
)

# Save

In [16]:
# Output file: 'z_score_std-' + stem of the input file, stored under RESULTS_DIR.
output_file = RESULTS_DIR.joinpath(f'z_score_std-{input_filepath_stem}.pkl').resolve()

display(output_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/data_transformations/z_score_std/z_score_std-diseases_only-smultixcan-efo_partial-mashr-zscores.pkl')

In [17]:
# Persist the standardized DataFrame as a pickle file at `output_file`.
scaled_data.to_pickle(output_file)