# Description

This notebook standardizes (z-scores) the S-MultiXcan results projected into the MultiPLIER latent space.

# Modules loading

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
from IPython.display import display

import numpy as np
import pandas as pd
from sklearn.preprocessing import scale

import conf

# Settings

In [3]:
# Fix the seed of NumPy's legacy global random generator so any random draw
# is reproducible. NOTE(review): no cell visible in this notebook draws random
# numbers directly - confirm whether the seed is still needed.
np.random.seed(0)

In [4]:
# Base directory for this notebook's inputs and outputs: the "shuffle_lvs"
# subfolder of the CLUSTERING_NULL_DIR results path configured in `conf`.
NULL_DIR = conf.RESULTS["CLUSTERING_NULL_DIR"] / "shuffle_lvs"

## Input data

In [5]:
# Absolute path of the pickled projection to standardize, located under
# NULL_DIR/projections.
INPUT_FILEPATH = (
    Path(NULL_DIR)
    / "projections"
    / "projection-smultixcan-efo_partial-mashr-zscores.pkl"
).resolve()

# File name without the extension; reused later to name the output file.
input_filepath_stem = INPUT_FILEPATH.stem

display(INPUT_FILEPATH)
display(input_filepath_stem)

PosixPath('/opt/data/results/clustering/null_sims/shuffle_lvs/projections/projection-smultixcan-efo_partial-mashr-zscores.pkl')

'projection-smultixcan-efo_partial-mashr-zscores'

## Output folder

In [6]:
# Folder where this notebook writes its result; it is created (including any
# missing parents) when it does not exist yet.
RESULTS_DIR = (Path(NULL_DIR) / "data_transformations" / "z_score_std").resolve()
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

display(RESULTS_DIR)

PosixPath('/opt/data/results/clustering/null_sims/shuffle_lvs/data_transformations/z_score_std')

# Load input file

In [7]:
# Load the pickled projection and transpose it, so that the LVs end up as
# columns (see the shape and head displayed in the next cells).
# NOTE: unpickling can execute arbitrary code; only load files produced by
# trusted steps of this pipeline.
data = pd.read_pickle(INPUT_FILEPATH).transpose()

In [8]:
# (rows, columns) of the transposed input.
display(data.shape)

(3752, 987)

In [9]:
# Preview the first rows of the raw (unscaled) values.
display(data.head())

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,...,LV978,LV979,LV980,LV981,LV982,LV983,LV984,LV985,LV986,LV987
100001_raw-Food_weight,-0.073679,-0.052828,0.041971,0.004956,-0.007681,-0.010081,0.05473,-0.021652,-0.054424,-0.001761,...,0.012072,0.027118,-0.018594,0.019354,0.013125,-0.027029,0.049547,0.008668,0.018525,0.016905
100002_raw-Energy,-0.015465,0.016444,-0.023914,-0.002509,-0.00076,0.005859,0.040347,0.003794,-0.01063,-0.050057,...,-0.001388,-0.033729,-0.029717,0.036433,0.028141,-0.02549,0.027895,-0.021836,-0.002272,-0.036747
100003_raw-Protein,-0.008602,-0.009284,-0.042691,0.037642,0.010101,-0.079309,-0.005715,-0.021619,-0.017458,0.036159,...,0.030732,0.019324,0.054912,-0.010213,0.012643,-0.006422,-0.057273,-0.003816,0.039237,0.023237
100004_raw-Fat,-0.012982,-0.00228,-0.010464,-0.037028,-0.008053,-0.007899,-0.019534,0.008677,-0.026418,-0.030625,...,0.007484,-0.04539,0.00098,0.015655,0.016314,-0.03342,0.009053,-0.05235,-0.0297,-0.079015
100005_raw-Carbohydrate,-0.045931,-0.032513,0.016177,0.01015,-0.038093,-0.027577,-0.001228,0.017367,-0.024227,-0.011707,...,-0.017966,-0.034462,-0.000168,-0.010531,-0.049522,0.005111,0.018046,0.035997,-0.0482,-0.001093


# z-score standardization

In [10]:
# Summary statistics of the first 10 columns before scaling; a baseline to
# compare against the standardized values.
data_stats = data.iloc[:, :10].describe()
display(data_stats)

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10
count,3752.0,3752.0,3752.0,3752.0,3752.0,3752.0,3752.0,3752.0,3752.0,3752.0
mean,0.00012,0.000679,9.4e-05,0.000226,-0.000352,-0.00034,-0.00113,0.000214,-0.00083,-0.000437
std,0.030651,0.030651,0.030284,0.029805,0.030425,0.031951,0.030644,0.030531,0.030294,0.03024
min,-0.096689,-0.102214,-0.100443,-0.090642,-0.093071,-0.089517,-0.116242,-0.096011,-0.099658,-0.093819
25%,-0.020699,-0.020577,-0.020308,-0.020856,-0.021171,-0.022392,-0.022418,-0.020228,-0.021644,-0.021053
50%,-0.000942,-0.00059,-0.00132,-0.000986,-0.001746,-0.001618,-0.002234,-0.000775,-0.002533,-0.002049
75%,0.019709,0.020179,0.019134,0.019199,0.019426,0.019117,0.018795,0.018845,0.018575,0.018473
max,0.156729,0.159284,0.152689,0.117356,0.132031,0.37582,0.262052,0.179894,0.118494,0.173786


In [11]:
# Standardize the data with scikit-learn's `scale` using its default
# arguments, then wrap the resulting array in a DataFrame that carries copies
# of the original row and column labels.
scaled_data = pd.DataFrame(
    scale(data),
    index=data.index.copy(),
    columns=data.columns.copy(),
)

In [12]:
# Scaling must not change the shape of the data.
display(scaled_data.shape)

(3752, 987)

In [13]:
# Preview the first rows of the standardized values.
display(scaled_data.head())

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,...,LV978,LV979,LV980,LV981,LV982,LV983,LV984,LV985,LV986,LV987
100001_raw-Food_weight,-2.408007,-1.745911,1.383013,0.158718,-0.240933,-0.304911,1.823117,-0.716278,-1.769346,-0.043794,...,0.399227,0.849885,-0.593292,0.636568,0.409726,-0.871713,1.63877,0.285165,0.585307,0.592347
100002_raw-Energy,-0.508508,0.514427,-0.792865,-0.091775,-0.013409,0.194047,1.353713,0.117286,-0.323516,-1.641116,...,-0.038535,-1.139394,-0.951591,1.197872,0.893819,-0.820655,0.928634,-0.716778,-0.085735,-1.208965
100003_raw-Protein,-0.284597,-0.325076,-1.41298,1.25551,0.343603,-2.471878,-0.14964,-0.715178,-0.548948,1.210352,...,1.006094,0.595064,1.774559,-0.335165,0.394177,-0.188345,-1.864716,-0.124875,1.253625,0.804944
100004_raw-Fat,-0.427503,-0.096524,-0.348654,-1.250087,-0.253145,-0.236606,-0.600627,0.277239,-0.844748,-0.998437,...,0.249993,-1.52066,0.037241,0.514992,0.512538,-1.083619,0.310643,-1.719033,-0.97077,-2.628077
100005_raw-Carbohydrate,-1.502621,-1.083034,0.531159,0.332981,-1.24064,-0.852565,-0.003171,0.561912,-0.772413,-0.372739,...,-0.57773,-1.16338,0.000251,-0.345607,-1.610004,0.194078,0.605594,1.182825,-1.567705,-0.011902


In [14]:
# Summary statistics of the first 10 columns after scaling.
scaled_data_stats = scaled_data.iloc[:, :10].describe()
display(scaled_data_stats)

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10
count,3752.0,3752.0,3752.0,3752.0,3752.0,3752.0,3752.0,3752.0,3752.0,3752.0
mean,-3.7875410000000005e-17,-5.302558e-17,2.130492e-17,-2.580262e-17,-1.0415740000000001e-17,-1.988459e-17,1.344577e-16,1.609705e-17,-2.2725250000000003e-17,-6.628197e-18
std,1.000133,1.000133,1.000133,1.000133,1.000133,1.000133,1.000133,1.000133,1.000133,1.000133
min,-3.158831,-3.357375,-3.320259,-3.049144,-3.047879,-2.791399,-3.756903,-3.152098,-3.262723,-3.088463
25%,-0.6793041,-0.693576,-0.67378,-0.707425,-0.6843905,-0.690273,-0.694754,-0.6696146,-0.6871465,-0.6818319
50%,-0.03464282,-0.0414021,-0.04666978,-0.04066792,-0.04583165,-0.04000847,-0.03600796,-0.03239147,-0.05621995,-0.05331637
75%,0.6391933,0.6362952,0.6288178,0.636628,0.6501193,0.609045,0.6503046,0.6103173,0.6406675,0.6254112
max,5.110108,5.17526,5.039507,3.930339,4.351697,11.77458,8.589505,5.885921,3.939406,5.762166


## Testing

In [15]:
# Every standardized column must have a mean of ~0. The check runs over all
# columns of `scaled_data`, not only over a summary of the first few, and is
# vectorized (same default tolerances as `np.isclose`).
assert np.allclose(
    scaled_data.mean(axis=0), 0.0
), "some standardized columns do not have zero mean"

In [16]:
# Every standardized column must have a standard deviation of ~1. pandas'
# `std` is the sample estimate (ddof=1), hence the 1e-03 absolute tolerance.
# The check runs over all columns of `scaled_data`, not only over a summary
# of the first few; a zero-variance column would (rightly) fail it.
assert np.allclose(
    scaled_data.std(axis=0), 1.0, atol=1e-03
), "some standardized columns do not have unit standard deviation"

# Save

In [17]:
# Output path: the input file's stem prefixed with "z_score_std-", stored in
# this notebook's results folder.
output_file = (RESULTS_DIR / f"z_score_std-{input_filepath_stem}.pkl").resolve()

display(output_file)

PosixPath('/opt/data/results/clustering/null_sims/shuffle_lvs/data_transformations/z_score_std/z_score_std-projection-smultixcan-efo_partial-mashr-zscores.pkl')

In [18]:
# Persist the standardized DataFrame as a pickle at the path displayed above.
scaled_data.to_pickle(output_file)