# Description

This notebook standardizes (z-scores) the S-MultiXcan results projected into the MultiPLIER latent space.

# Modules loading

In [1]:
# Pick up edits to imported modules without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
from IPython.display import display

import numpy as np
import pandas as pd
from sklearn.preprocessing import scale

import conf

# Settings

In [3]:
# Fix the seed of NumPy's global random generator so reruns are repeatable.
RANDOM_SEED = 0
np.random.seed(RANDOM_SEED)

In [4]:
# Base folder of the "shuffle_genes" null simulation; the input file and the
# output folder of this notebook are both located under it.
null_sims_root = conf.RESULTS["CLUSTERING_NULL_DIR"]
NULL_DIR = null_sims_root / "shuffle_genes"

## Input data

In [5]:
# Pickled S-MultiXcan projection that this notebook standardizes.
INPUT_FILEPATH = (
    Path(NULL_DIR)
    / "projections"
    / "projection-smultixcan-efo_partial-mashr-zscores.pkl"
).resolve()
display(INPUT_FILEPATH)

# File name without its extension; the output file name is derived from it.
input_filepath_stem = INPUT_FILEPATH.stem
display(input_filepath_stem)

PosixPath('/opt/data/results/clustering/null_sims/shuffle_genes/projections/projection-smultixcan-efo_partial-mashr-zscores.pkl')

'projection-smultixcan-efo_partial-mashr-zscores'

## Output folder

In [6]:
# Folder where this notebook writes its result; it is created (including any
# missing parents) and an already existing folder is not an error.
RESULTS_DIR = (Path(NULL_DIR) / "data_transformations" / "z_score_std").resolve()
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

display(RESULTS_DIR)

PosixPath('/opt/data/results/clustering/null_sims/shuffle_genes/data_transformations/z_score_std')

# Load input file

In [7]:
# Read the pickled projection and transpose it; judging by the output shown
# below, the result has one row per trait and one column per latent variable (LV).
data = pd.read_pickle(INPUT_FILEPATH).transpose()

In [8]:
# (number of rows, number of columns) of the transposed input matrix.
display(data.shape)

(3752, 987)

In [9]:
# Peek at the first rows to check labels and the scale of the raw values.
display(data.head())

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,...,LV978,LV979,LV980,LV981,LV982,LV983,LV984,LV985,LV986,LV987
100001_raw-Food_weight,-0.004991,-0.02016,-0.02603,0.020068,0.011753,0.017012,0.049228,-0.027804,0.020122,-0.043951,...,-0.027612,0.011046,-0.014123,-0.026495,-0.034632,-0.007654,0.039811,0.001866,-0.026045,0.024954
100002_raw-Energy,0.015146,0.025381,0.03099,-0.004168,0.006254,0.01402,0.006114,-0.039952,-0.004144,0.029471,...,-0.0178,0.018595,-0.00306,-0.044417,0.021673,0.009405,-0.01902,0.042079,0.016351,0.032201
100003_raw-Protein,0.011558,0.026267,-0.00883,0.023151,-0.0147,0.015522,0.010072,0.005893,0.030038,-0.053161,...,-0.01448,0.008231,-0.004317,-0.038221,0.008717,0.017687,0.01466,-0.003575,-0.024418,-0.032628
100004_raw-Fat,-0.004454,0.025022,0.010287,-0.045471,0.035624,0.054628,-0.015242,-0.031907,0.007947,0.024257,...,-0.012052,-0.03622,-0.009773,0.024453,-0.020112,0.032634,-0.0005,0.010128,-0.000518,0.051511
100005_raw-Carbohydrate,-0.004064,-0.014816,0.027811,0.015872,-0.012119,-0.03521,-0.035111,0.026068,0.03148,0.001849,...,0.044601,-0.002078,-0.002292,-0.038174,-0.012224,0.012797,0.048189,-0.026752,0.0008,0.019285


# z-score standardization

In [10]:
# Summary statistics of the first ten columns only, as a quick look at the
# location and spread of the values before scaling.
leading_columns = data.iloc[:, :10]
data_stats = leading_columns.describe()
display(data_stats)

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10
count,3752.0,3752.0,3752.0,3752.0,3752.0,3752.0,3752.0,3752.0,3752.0,3752.0
mean,4.260984e-18,2.130492e-18,-1.006066e-18,2.308033e-18,-3.5508199999999997e-19,-1.242787e-18,4.231394e-18,3.402869e-18,-4.438525e-19,-2.441189e-18
std,0.02851547,0.02839185,0.02601169,0.02896133,0.02871084,0.02646271,0.02785,0.02597797,0.02486356,0.02826685
min,-0.1120355,-0.1015075,-0.0852402,-0.1071991,-0.08668615,-0.121563,-0.09090851,-0.08776256,-0.09062002,-0.08284056
25%,-0.01855408,-0.01926075,-0.01722247,-0.01968283,-0.02006003,-0.018104,-0.0186363,-0.0176906,-0.01649458,-0.01952048
50%,-0.0008137997,-0.0008927103,-0.000145856,-0.000120066,-0.001103916,0.0001469371,-0.001021344,-0.0009058482,-7.845392e-05,-0.0002889262
75%,0.01803097,0.01816413,0.01684615,0.01861076,0.01829824,0.01733025,0.01766101,0.01689003,0.01608497,0.01776515
max,0.1318461,0.1379319,0.2187188,0.1073417,0.1785174,0.1469357,0.1004747,0.1862624,0.1982892,0.1460073


In [11]:
# z-score each column with sklearn's `scale`, which returns a plain array,
# then put the original row/column labels back (copied, not shared with `data`).
scaled_values = scale(data)
scaled_data = pd.DataFrame(
    scaled_values,
    index=data.index.copy(),
    columns=data.columns.copy(),
)

In [12]:
# Scaling must not change the shape; compare with the input shape shown earlier.
display(scaled_data.shape)

(3752, 987)

In [13]:
# Peek at the first rows of the standardized values.
display(scaled_data.head())

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,...,LV978,LV979,LV980,LV981,LV982,LV983,LV984,LV985,LV986,LV987
100001_raw-Food_weight,-0.175043,-0.710171,-1.000846,0.693026,0.409413,0.642958,1.767862,-1.070425,0.809395,-1.55505,...,-0.932802,0.371349,-0.521787,-0.881136,-1.193309,-0.257883,1.382268,0.066549,-0.866086,0.847997
100002_raw-Energy,0.531227,0.894062,1.191552,-0.143922,0.217867,0.529878,0.219561,-1.538143,-0.166704,1.042749,...,-0.601329,0.625139,-0.113048,-1.477163,0.746773,0.316892,-0.660407,1.501067,0.543744,1.094294
100003_raw-Protein,0.405395,0.925288,-0.339507,0.799476,-0.512075,0.586632,0.361706,0.22686,1.208289,-1.880931,...,-0.489171,0.276717,-0.159505,-1.271106,0.300351,0.595915,0.50902,-0.12754,-0.811987,-1.108812
100004_raw-Fat,-0.15623,0.881426,0.395527,-1.570256,1.240955,2.064595,-0.547373,-1.228389,0.319684,0.858253,...,-0.407151,-1.217662,-0.361052,0.813221,-0.692981,1.099523,-0.017366,0.361278,-0.017222,1.750511
100005_raw-Carbohydrate,-0.142531,-0.521907,1.069304,0.548122,-0.422146,-1.330718,-1.260872,1.003617,1.266289,0.065408,...,1.506698,-0.069854,-0.084662,-1.269536,-0.421185,0.43118,1.673162,-0.954336,0.026593,0.655359


In [14]:
# Same ten-column summary as before scaling, now on the standardized values.
leading_scaled_columns = scaled_data.iloc[:, :10]
scaled_data_stats = leading_scaled_columns.describe()
display(scaled_data_stats)

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10
count,3752.0,3752.0,3752.0,3752.0,3752.0,3752.0,3752.0,3752.0,3752.0,3752.0
mean,1.893771e-18,1.0415740000000001e-17,-1.5150160000000002e-17,2.2725250000000003e-17,-1.0415740000000001e-17,-1.8937710000000002e-17,-8.99541e-18,2.1778360000000002e-17,-1.4203280000000003e-17,-1.230951e-17
std,1.000133,1.000133,1.000133,1.000133,1.000133,1.000133,1.000133,1.000133,1.000133,1.000133
min,-3.929463,-3.575712,-3.277433,-3.701949,-3.019686,-4.594361,-3.264654,-3.378796,-3.645178,-2.931052
25%,-0.6507538,-0.6784805,-0.6621935,-0.6797152,-0.698785,-0.6842237,-0.6692562,-0.6810757,-0.663492,-0.6906704
50%,-0.02854269,-0.03144668,-0.005608072,-0.004146288,-0.03845457,0.005553349,-0.03667792,-0.03487451,-0.003155798,-0.01022274
75%,0.6324069,0.639851,0.6477239,0.6426929,0.6374135,0.6549806,0.6342319,0.6502541,0.6470156,0.6285638
max,4.624285,4.858798,8.409602,3.706873,6.218598,5.553296,3.60819,7.170969,7.976157,5.166008


## Testing

In [15]:
# Every column must be centered at zero. The check runs on the full scaled
# matrix rather than on `scaled_data_stats`, which only summarizes the first
# ten columns and would leave the remaining ones unverified.
col_means = scaled_data.mean(axis=0)
off_center = col_means[~np.isclose(col_means.to_numpy(), 0.0)]
assert off_center.empty, (
    f"{len(off_center)} columns are not centered at zero after scaling, "
    f"e.g. {off_center.index[:5].tolist()}"
)

In [16]:
# Every column must have unit standard deviation. The check runs on the full
# scaled matrix rather than on `scaled_data_stats`, which only summarizes the
# first ten columns.
# `ddof=0` gives the population standard deviation, the convention used by
# sklearn's `scale`, so the expected value is 1 whatever the number of rows
# (the sample std reported by `describe()` is 1.000133 for this data).
# Columns that could not be brought to unit variance (e.g. constant ones) fail here.
col_stds = scaled_data.std(axis=0, ddof=0)
not_unit_std = col_stds[~np.isclose(col_stds.to_numpy(), 1.0, atol=1e-03)]
assert not_unit_std.empty, (
    f"{len(not_unit_std)} columns do not have unit std after scaling, "
    f"e.g. {not_unit_std.index[:5].tolist()}"
)

# Save

In [17]:
# Output path: the input file's stem prefixed with the transformation name,
# placed inside this notebook's results folder.
output_name = f"z_score_std-{input_filepath_stem}.pkl"
output_file = (Path(RESULTS_DIR) / output_name).resolve()

display(output_file)

PosixPath('/opt/data/results/clustering/null_sims/shuffle_genes/data_transformations/z_score_std/z_score_std-projection-smultixcan-efo_partial-mashr-zscores.pkl')

In [18]:
# Write the standardized matrix to `output_file` in pickle format.
scaled_data.to_pickle(output_file)