# Description

It standardizes (z-scores) the eMERGE S-MultiXcan results projected into the MultiPLIER latent space.

# Environment variables

In [1]:
from IPython.display import display

import conf

# Number of parallel jobs/threads, read from the project configuration; it is
# reused below to cap the thread pools of the numerical libraries.
N_JOBS = conf.GENERAL["N_JOBS"]
display(N_JOBS)

4

In [2]:
%env MKL_NUM_THREADS=$N_JOBS
%env OPEN_BLAS_NUM_THREADS=$N_JOBS
%env NUMEXPR_NUM_THREADS=$N_JOBS
%env OMP_NUM_THREADS=$N_JOBS

env: MKL_NUM_THREADS=4
env: OPEN_BLAS_NUM_THREADS=4
env: NUMEXPR_NUM_THREADS=4
env: OMP_NUM_THREADS=4


# Modules loading

In [3]:
# Load the IPython autoreload extension and reload all modules before each
# cell executes (mode 2), so edits to imported project modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.preprocessing import scale

# Settings

## Input data

## Output folder

In [5]:
# Folder where this notebook writes its results; it is created (together with
# any missing parent folders) when it does not exist yet.
RESULTS_DIR = (
    Path(conf.RESULTS["DATA_TRANSFORMATIONS_DIR"]) / "z_score_std"
).resolve()
RESULTS_DIR.mkdir(exist_ok=True, parents=True)

display(RESULTS_DIR)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/data_transformations/z_score_std')

# Load eMERGE data

## Projection of S-MultiXcan z-scores

In [19]:
# Pickle file with the projection of the eMERGE S-MultiXcan z-scores.
input_file = (
    Path(conf.RESULTS["PROJECTIONS_DIR"])
    / "projection-emerge-smultixcan-mashr-zscores.pkl"
).resolve()
display(input_file)
# Stop here if the projection has not been generated yet.
assert input_file.exists()

# File name without extension; reused later to build the output file name.
input_filepath_stem = input_file.stem
display(input_filepath_stem)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/projections/projection-emerge-smultixcan-mashr-zscores.pkl')

'projection-emerge-smultixcan-mashr-zscores'

In [20]:
# Load the projected matrix from the pickle file selected above.
# NOTE(review): the name `pmbb_data` looks like a leftover from another
# notebook; the file loaded here is the eMERGE projection. Consider renaming
# it consistently across the cells that use it.
pmbb_data = pd.read_pickle(input_file)

In [21]:
# Dimensions of the loaded projection as (rows, columns).
pmbb_data.shape

(987, 309)

In [22]:
# Preview the first rows of the loaded projection (one row per LV).
pmbb_data.head()

Unnamed: 0,292.3,079,741,418.1,280,747,591,426.21,244,070.3,...,427,585.4,345,366.2,420.2,195,250.2,250.1,371,562
LV1,0.054491,-0.020097,0.022097,0.011815,-0.027529,-0.044248,0.022966,-0.052994,0.049973,0.00023,...,-0.03238,0.012214,0.006572,-0.024007,-0.006435,0.05631,0.072363,0.039763,-0.022529,0.03441
LV2,-0.045586,-0.009249,0.037598,-0.017011,-0.033393,-0.018336,0.007075,-0.011952,-0.002291,0.029972,...,0.018297,0.037166,-0.011099,-0.012876,-0.055898,0.002411,0.021665,0.011047,0.003468,0.035322
LV3,0.024822,-0.060301,-0.029168,-0.034559,0.006261,0.022122,-0.006224,0.000662,0.036817,-7.7e-05,...,0.03856,-0.038163,0.003248,-0.041152,0.014292,-0.03949,0.009368,0.013476,0.044817,0.004777
LV4,-0.039762,0.012785,-0.0037,0.028548,-0.029743,-0.00373,0.0356,-0.022699,0.005748,0.037,...,-0.029203,0.07504,0.042023,-0.027431,0.004113,0.028081,0.008579,0.017368,-0.073883,-0.020724
LV5,-0.012542,-0.003956,-0.026013,0.012478,0.007327,0.024376,-0.000232,0.063066,0.044968,-0.002194,...,0.030944,-0.035498,-0.013057,-0.006033,-0.035562,-0.009096,-0.006855,-0.036068,0.002163,0.006373


# z-score standardization

In [23]:
# Transpose so that the LVs (rows of the loaded matrix) become columns; the
# standardization below is then computed per LV.
data = pmbb_data.T

In [24]:
# Summary statistics of the first ten columns before standardization, kept for
# comparison with the same summary computed after scaling.
data_stats = data.iloc[:, :10].describe()
display(data_stats)

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10
count,309.0,309.0,309.0,309.0,309.0,309.0,309.0,309.0,309.0,309.0
mean,1.257534e-18,5.748728e-18,-4.131898e-18,6.916438e-18,2.55998e-18,-4.491192999999999e-19,6.287671e-18,2.874364e-18,-1.3922700000000001e-18,4.670841e-18
std,0.03003813,0.02772754,0.02732986,0.02901664,0.02987249,0.02819247,0.03052889,0.02689039,0.02561742,0.03038666
min,-0.08840341,-0.07784959,-0.07319803,-0.07790815,-0.07406592,-0.06579391,-0.09306557,-0.05959812,-0.06227625,-0.09811341
25%,-0.02009651,-0.01725108,-0.01661498,-0.0205009,-0.02057211,-0.01709499,-0.01863882,-0.01849178,-0.01736913,-0.02025966
50%,-0.0008071688,0.001684139,0.001123537,-0.0001685349,-0.002194497,0.0001454591,-0.001634808,-0.0002537379,0.0002284704,0.001140763
75%,0.02028215,0.01706551,0.01770317,0.01768648,0.01670923,0.02064828,0.01848979,0.01874173,0.0174115,0.01677429
max,0.08298141,0.07506923,0.08396437,0.07752567,0.1013489,0.0867563,0.08158373,0.09465645,0.09108183,0.1247821


In [25]:
# Standardize the matrix with scikit-learn's `scale` and wrap the resulting
# array in a new DataFrame that carries copies of the original row and column
# labels.
scaled_data = pd.DataFrame(
    scale(data),
    columns=data.columns.copy(),
    index=data.index.copy(),
)

In [26]:
# Dimensions of the standardized matrix as (rows, columns).
display(scaled_data.shape)

(309, 987)

In [27]:
# Preview the first rows of the standardized matrix.
display(scaled_data.head())

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,...,LV978,LV979,LV980,LV981,LV982,LV983,LV984,LV985,LV986,LV987
292.3,1.816988,-1.646752,0.909722,-1.372557,-0.420517,0.967264,-0.214345,3.525794,0.424954,1.303278,...,-0.169303,1.105963,0.624641,-0.498163,-0.471079,-2.176723,0.099151,-0.409419,-0.738851,-1.034009
79.0,-0.670119,-0.33409,-2.209989,0.441335,-0.132648,-0.564409,-0.442529,0.290681,0.442016,-1.673754,...,0.245237,2.406511,-0.347709,-0.374762,0.918071,-0.162533,0.737962,-0.843397,-0.419225,-0.872887
741.0,0.736815,1.358195,-1.068975,-0.127715,-0.872216,-2.209448,-0.444954,-0.951899,0.934908,0.153968,...,1.613549,1.186504,0.13792,-0.929207,-0.935787,0.652984,-0.616765,-1.587159,0.011447,-1.446751
418.1,0.393961,-0.614508,-1.266554,0.985434,0.418391,-0.116737,-0.318609,1.531675,-0.41989,0.479685,...,-0.033526,0.242118,0.361197,-0.872287,0.504552,0.896717,-0.335216,-0.037683,-0.459028,-0.164652
280.0,-0.917954,-1.206278,0.229449,-1.026703,0.24566,0.118361,-1.887569,-1.181531,-0.804771,-0.181034,...,-0.52772,-1.344066,0.02114,-0.026648,0.354607,0.122197,-0.003083,-0.922225,1.436063,-0.522746


In [28]:
# Summary statistics of the first ten columns after standardization.
# Note that `describe()` reports the sample standard deviation (ddof=1).
scaled_data_stats = scaled_data.iloc[:, :10].describe()
display(scaled_data_stats)

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10
count,309.0,309.0,309.0,309.0,309.0,309.0,309.0,309.0,309.0,309.0
mean,2.2994910000000002e-17,-4.8864180000000006e-17,2.012055e-17,-2.012055e-17,5.748728000000001e-17,-1.4371820000000002e-17,-2.874364e-18,-2.874364e-18,-1.868336e-17,5.748728e-18
std,1.001622,1.001622,1.001622,1.001622,1.001622,1.001622,1.001622,1.001622,1.001622,1.001622
min,-2.947813,-2.812217,-2.682661,-2.689303,-2.483424,-2.337526,-3.053387,-2.21993,-2.434955,-3.234069
25%,-0.6701185,-0.6231734,-0.6089284,-0.7076684,-0.6897812,-0.6073509,-0.6115207,-0.6887878,-0.6791201,-0.6678102
50%,-0.02691506,0.06083738,0.0411769,-0.005817637,-0.07358131,0.005167871,-0.05363641,-0.00945131,0.008933025,0.03760246
75%,0.6763088,0.6164699,0.6488097,0.6105177,0.5602589,0.7335921,0.6066312,0.6980981,0.6807766,0.5529233
max,2.767016,2.71178,3.077241,2.6761,3.39822,3.082278,2.67668,3.525794,3.561232,4.113137


## Testing

In [29]:
# Every column of the standardized matrix must have a mean of ~0.
# The check runs on the full `scaled_data` frame instead of the ten-column
# summary in `scaled_data_stats`, so no LV is left unverified.
assert np.allclose(
    scaled_data.mean(axis=0), 0.0
), "Some columns do not have zero mean after standardization"

In [30]:
# Every column of the standardized matrix must have unit standard deviation.
# The population std (ddof=0) is compared, which is what the scaler normalizes
# by; the sample std reported by `describe()` is larger by sqrt(n / (n - 1)),
# which is why the previous check needed a loose tolerance. The check also
# covers all columns instead of only the first ten.
# NOTE(review): a constant (zero-variance) column would fail this assertion;
# presumably none is expected in this matrix - confirm.
assert np.allclose(
    scaled_data.std(axis=0, ddof=0), 1.0
), "Some columns do not have unit standard deviation after standardization"

# Save

In [31]:
# Destination of the standardized matrix: the input file stem prefixed with
# the name of the transformation, inside this notebook's results folder.
output_file = (RESULTS_DIR / f"z_score_std-{input_filepath_stem}.pkl").resolve()

display(output_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/data_transformations/z_score_std/z_score_std-projection-emerge-smultixcan-mashr-zscores.pkl')

In [32]:
# Persist the standardized matrix as a pickle file at `output_file`.
scaled_data.to_pickle(output_file)